blob: 8f5f8e3d07a34dd65c13fc00e9997aa154194cfe [file] [log] [blame]
/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Usama Arif0681e3b2019-04-25 14:28:07 +010024#include "gemm_helpers.h"
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +000025#include "repeat.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010027#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
/* Per-lane index vectors (0, 1, ..., K0-1) for each supported vector width.
 * Every literal is wrapped in an extra pair of parentheses so that operators
 * applied at the use site (e.g. a component selector) bind to the whole vector.
 */
#define INC2 ((VEC_DATA_TYPE(uint, 2))(0, 1))
#define INC3 ((VEC_DATA_TYPE(uint, 3))(0, 1, 2))
#define INC4 ((VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3))
#define INC8 ((VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7))
#define INC16 ((VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))

/* Two-level expansion: INC(K0) first expands K0 to its numeric value and then
 * pastes it onto "INC", selecting one of the vectors above.
 */
#define CONCAT_INC(K0) INC##K0
#define INC(K0) CONCAT_INC(K0)

36#if(SRC_WIDTH % K0)
37#define BOUNDARY_CONDITION_X(x, a) \
38 ({ \
39 a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
40 })
41#else // (SRC_WIDTH % K0)
42#define BOUNDARY_CONDITION_X(x, a) \
43 ({})
44#endif // (SRC_WIDTH % K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000045
/** Load an M0xK0 block, shrinking the load at the bottom/right edge of the source tensor.
 *
 * The macro reads the block coordinates from the variables `x` and `y` that must be
 * visible at the expansion site (they are NOT macro parameters):
 *  - rows:    PARTIAL_LOAD_M0 rows are loaded when (y * M0 + M0 >= SRC_HEIGHT) and
 *             PARTIAL_LOAD_M0 != 0, otherwise M0 rows;
 *  - columns: PARTIAL_LOAD_K0 columns are loaded when (x * K0 + K0 >= SRC_WIDTH) and
 *             PARTIAL_LOAD_K0 != 0, otherwise K0 columns.
 * The actual load is delegated to LOAD_TENSOR_M0XN0. The size arguments are forwarded
 * to it without extra parentheses because that macro is defined outside this file and
 * may token-paste them.
 *
 * @param[in]  M0           Number of rows of a full block
 * @param[in]  K0           Number of columns of a full block
 * @param[in]  DATA_TYPE    Element data type
 * @param[out] a            Basename of the row variables receiving the data
 * @param[in]  input_ptr    Address of the first element of the block
 * @param[in]  src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  zin          Basename of the per-row z offsets
 */
#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                      \
    ({                                                                                                            \
        if(y * (M0) + (M0) >= (SRC_HEIGHT) && (PARTIAL_LOAD_M0) != 0)                                             \
        {                                                                                                         \
            if(x * (K0) + (K0) >= (SRC_WIDTH) && (PARTIAL_LOAD_K0) != 0)                                          \
            {                                                                                                     \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);  \
            }                                                                                                     \
            else                                                                                                  \
            {                                                                                                     \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);               \
            }                                                                                                     \
        }                                                                                                         \
        else                                                                                                      \
        {                                                                                                         \
            if(x * (K0) + (K0) >= (SRC_WIDTH) && (PARTIAL_LOAD_K0) != 0)                                          \
            {                                                                                                     \
                LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);               \
            }                                                                                                     \
            else                                                                                                  \
            {                                                                                                     \
                LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);                            \
            }                                                                                                     \
        }                                                                                                         \
    })

Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000072/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
73 * the output matrix unrolling the values.
74 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010075 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
76 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010077 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010078 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
79 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010080 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
81 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000082 * @note Only the following values for M0, K0 and V0 are supported:
83 * M0: 2,3,4,5,6,7,8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +000084 * K0: 2,3,4,8,16
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000085 * V0: greater than 0
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010086 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000087 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
88 * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
89 * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
90 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
91 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
92 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +010093 * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000094 * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
95 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
96 * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
97 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
98 * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
99 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
100 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
101 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
102 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
103 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
104 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
105 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
106 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
107 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
108 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
109 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
110 */
111__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
112 TENSOR3D_DECLARATION(dst)
113#if defined(REINTERPRET_INPUT_AS_3D)
114 ,
115 uint cross_plane_pad
116#endif // REINTERPRET_INPUT_AS_3D
117 )
118{
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000119 // Block size
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000120#define BLOCK_SIZE ((M0) * (K0))
121
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000122 // Output offset X
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000123#if defined(INTERLEAVE)
124#define OUTPUT_OFFSET_X (K0)
125#else // defined(INTERLEAVE)
126#define OUTPUT_OFFSET_X (BLOCK_SIZE)
127#endif // defined(INTERLEAVE)
128
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000129 // Output step X
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000130#if defined(INTERLEAVE)
131#define OUTPUT_STEP_X (K0) * (V0)
132#else // Do not interleave
133#define OUTPUT_STEP_X (K0)
134#endif // defined(INTERLEAVE)
135
136 // Compute source and destination addresses
137 uint x = get_global_id(0);
138 uint y = get_global_id(1);
139 uint z = get_global_id(2);
140
141 // ------------------ Compute input/output addresses ---------------------------
142
143 // Compute the input address
144 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
145
146 // Compute the output address
147 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
148 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
149
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000150 // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
151 REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000152
153#if defined(REINTERPRET_INPUT_AS_3D)
154 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
155 // multiply src_stride_z by DEPTH_GEMM3D
156
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000157 input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
158
159 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +0100160 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000161
162#else // defined(REINTERPRET_INPUT_AS_3D)
163
164 input_ptr += z * (uint)src_stride_z;
165
166#endif // defined(REINTERPRET_INPUT_AS_3D)
167
168 // Add offset for batched GEMM
169 output_ptr += z * (uint)dst_stride_z;
170
171 // ---------------------------Load input values --------------------------------
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000172 // Load values from the LHS matrix
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100173 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
174
175 LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
176
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000177 // ---------------------------Store output values ------------------------------
Usama Arif0681e3b2019-04-25 14:28:07 +0100178 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
179 STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000180
181#undef BLOCK_SIZE
182#undef OUTPUT_OFFSET_X
183#undef OUTPUT_STEP_X
184}
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000185
186#if M0 == 2
187#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
188 ({ \
189 VEC_DATA_TYPE(DATA_TYPE, M0) \
190 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
191 VSTORE(M0) \
192 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
193 })
194#elif M0 == 3 // M0 == 3
195#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
196 ({ \
197 VEC_DATA_TYPE(DATA_TYPE, M0) \
198 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
199 VSTORE(M0) \
200 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
201 })
202#elif M0 == 4 // M0 == 4
203#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
204 ({ \
205 VEC_DATA_TYPE(DATA_TYPE, M0) \
206 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
207 VSTORE(M0) \
208 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
209 })
210#elif M0 == 5 // M0 == 5
211#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
212 ({ \
213 VEC_DATA_TYPE(DATA_TYPE, 4) \
214 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
215 DATA_TYPE res1 = a4.s##i; \
216 VSTORE(4) \
217 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
218 *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
219 })
220#elif M0 == 6 // M0 == 6
221#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
222 ({ \
223 VEC_DATA_TYPE(DATA_TYPE, 4) \
224 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
225 VEC_DATA_TYPE(DATA_TYPE, 2) \
226 res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
227 VSTORE(4) \
228 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
229 VSTORE(2) \
230 (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
231 })
232#elif M0 == 7 // M0 == 7
233#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
234 ({ \
235 VEC_DATA_TYPE(DATA_TYPE, 4) \
236 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
237 VEC_DATA_TYPE(DATA_TYPE, 3) \
238 res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
239 VSTORE(4) \
240 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
241 VSTORE(3) \
242 (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
243 })
244#elif M0 == 8 // M0 == 8
245#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
246 ({ \
247 VEC_DATA_TYPE(DATA_TYPE, M0) \
248 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
249 VSTORE(M0) \
250 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
251 })
252#else // M0 not supported
253#error "M0 value not supported"
254#endif // N0 conditions
255
256/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
257 * the output matrix unrolling the values.
258 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100259 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
260 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100261 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100262 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
263 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100264 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
265 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000266 * @note Only the following values for M0, K0 and V0 are supported:
267 * M0: 2,3,4,5,6,7,8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000268 * K0: 2,3,4,8,16
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000269 * V0: greater than 0
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100270 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000271 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
272 * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
273 * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
274 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
275 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
276 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100277 * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000278 * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
279 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
280 * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
281 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
282 * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
283 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
284 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
285 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
286 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
287 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
288 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
289 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
290 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
291 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
292 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
293 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
294 */
295__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
296 TENSOR3D_DECLARATION(dst)
297#if defined(REINTERPRET_INPUT_AS_3D)
298 ,
299 uint cross_plane_pad
300#endif // REINTERPRET_INPUT_AS_3D
301 )
302{
303 // Block size
304#define BLOCK_SIZE ((M0) * (K0))
305
306 // Output offset X
307#if defined(INTERLEAVE)
308#define OUTPUT_OFFSET_X (M0)
309#else // defined(INTERLEAVE)
310#define OUTPUT_OFFSET_X (BLOCK_SIZE)
311#endif // defined(INTERLEAVE)
312
313 // Output step X
314#if defined(INTERLEAVE)
315#define OUTPUT_STEP_X (M0) * (V0)
316#else // Do not interleave
317#define OUTPUT_STEP_X (M0)
318#endif // defined(INTERLEAVE)
319
320 // Compute source and destination addresses
321 uint x = get_global_id(0);
322 uint y = get_global_id(1);
323 uint z = get_global_id(2);
324
325 // ------------------ Compute input/output addresses ---------------------------
326
327 // Compute the input address
328 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
329
330 // Compute the output address
331 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
332 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
333
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000334 // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
335 REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000336
337#if defined(REINTERPRET_INPUT_AS_3D)
338 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
339 // multiply src_stride_z by DEPTH_GEMM3D
340
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000341 input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
342
343 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +0100344 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000345
346#else // defined(REINTERPRET_INPUT_AS_3D)
347
348 input_ptr += z * (uint)src_stride_z;
349
350#endif // defined(REINTERPRET_INPUT_AS_3D)
351
352 // Add offset for batched GEMM
353 output_ptr += z * (uint)dst_stride_z;
354
355 // ---------------------------Load input values --------------------------------
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100356 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000357
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100358 LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
359
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000360 // ---------------------------Transpose and store block -----------------------
361
362 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
363 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
364#if K0 > 2
365 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000366#endif // K0 > 2
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000367#if K0 > 3
368 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
369#endif // K0 > 3
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000370#if K0 > 4
371 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
372 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
373 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
374 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
375#endif // K0 > 4
376#if K0 > 8
377 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
378 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
379 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
380 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
381 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
382 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
383 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
384 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
385#endif // K0 > 8
386
387#undef BLOCK_SIZE
388#undef OUTPUT_OFFSET_X
389#undef OUTPUT_STEP_X
390}
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100391#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000392
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000393#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
394/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
395 * the output matrix unrolling the values.
396 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100397 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
398 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
399 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
400 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000401 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
402 * @note Only the following values for K0, N0 and H0 are supported:
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000403 * N0: 2,3,4,8,16
404 * K0: 1,2,3,4,8,16
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000405 * H0: greater than 0
406 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100407 * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000408 * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
409 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
410 * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
411 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
412 * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
413 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
414 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
415 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
416 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
417 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
418 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
419 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
420 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
421 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
422 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
423 */
424__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
425 TENSOR3D_DECLARATION(dst))
426{
427 // Block size
428#define BLOCK_SIZE ((K0) * (N0))
429
430 // Output offset X
431#if defined(INTERLEAVE)
432#define OUTPUT_OFFSET_X (N0)
433#else // defined(INTERLEAVE)
434#define OUTPUT_OFFSET_X (BLOCK_SIZE)
435#endif // defined(INTERLEAVE)
436
437 // Output step X
438#if defined(INTERLEAVE)
439#define OUTPUT_STEP_X (N0) * (H0)
440#else // Do not interleave
441#define OUTPUT_STEP_X (N0)
442#endif // defined(INTERLEAVE)
443
444 // Compute source and destination addresses
445 uint x = get_global_id(0);
446 uint y = get_global_id(1);
447 uint z = get_global_id(2);
448
449 // ------------------ Compute input/output addresses ---------------------------
450
451 // Compute the input address
452 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
453
454 // Compute the output address
455 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
456 x / (uint)H0)
457 * (uint)dst_stride_y)
458 + z * (uint)dst_stride_z;
459
460 // ---------------------------Load input values --------------------------------
461
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +0000462 REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0;
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000463
464 // Load values from the RHS matrix
465 a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
466#if K0 > 1
467 if(y * (uint)K0 + 1 < SRC_HEIGHT)
468 {
469 a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
470 }
471#endif // K0 > 1
472#if K0 > 2
473 if(y * (uint)K0 + 2 < SRC_HEIGHT)
474 {
475 a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
476 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000477#endif // K0 > 2
478#if K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000479 if(y * (uint)K0 + 3 < SRC_HEIGHT)
480 {
481 a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
482 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000483#endif // K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000484#if K0 > 4
485 if(y * (uint)K0 + 4 < SRC_HEIGHT)
486 {
487 a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
488 }
489 if(y * (uint)K0 + 5 < SRC_HEIGHT)
490 {
491 a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
492 }
493 if(y * (uint)K0 + 6 < SRC_HEIGHT)
494 {
495 a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
496 }
497 if(y * (uint)K0 + 7 < SRC_HEIGHT)
498 {
499 a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
500 }
501#endif // K0 > 4
502#if K0 > 8
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000503 if(y * (uint)K0 + 8 < SRC_HEIGHT)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000504 {
505 a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
506 }
507 if(y * (uint)K0 + 9 < SRC_HEIGHT)
508 {
509 a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
510 }
511 if(y * (uint)K0 + 10 < SRC_HEIGHT)
512 {
513 aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
514 }
515 if(y * (uint)K0 + 11 < SRC_HEIGHT)
516 {
517 aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
518 }
519 if(y * (uint)K0 + 12 < SRC_HEIGHT)
520 {
521 aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
522 }
523 if(y * (uint)K0 + 13 < SRC_HEIGHT)
524 {
525 aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
526 }
527 if(y * (uint)K0 + 14 < SRC_HEIGHT)
528 {
529 aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
530 }
531 if(y * (uint)K0 + 15 < SRC_HEIGHT)
532 {
533 aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
534 }
535#endif // K0 > 8
536
537 // ---------------------------Store output values ------------------------------
Usama Arif0681e3b2019-04-25 14:28:07 +0100538 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
539 STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000540
541#undef BLOCK_SIZE
542#undef OUTPUT_OFFSET_X
543#undef OUTPUT_STEP_X
544}
545
#if defined(TRANSPOSE)
/** This OpenCL kernel reshapes the RHS input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (transposed) in
 *  the output matrix, unrolling the values.
 *
 * Each work-item reads up to K0 rows of N0 elements starting at column x * N0 and row y * K0 of plane z, transposes them into
 * N0 vectors of K0 elements and writes those vectors to the destination.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 * @note The option -DTRANSPOSE must be passed at compile time.
 * @note Only the following values for K0, N0 and H0 are supported:
 *                                      N0: 2,3,4,8,16
 *                                      K0: 2,3,4,8,16
 *                                      H0: greater than 0
 * @note Rows 1..K0-1 of a block are only read when their index is smaller than SRC_HEIGHT; rows that are skipped keep the
 *       value 0 they were initialised with. Row 0 of the block is always read.
 *
 * @param[in]  src_ptr                            Pointer to the source RHS tensor. Supported data types: All
 * @param[in]  src_stride_x                       Stride of the source RHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source RHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source RHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source RHS tensor
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst))
{
    // Number of elements in one K0xN0 block
#define BLOCK_SIZE ((K0) * (N0))

    // OUTPUT_OFFSET_X: distance (in elements) between the start of two blocks stored on the same output row
    // OUTPUT_STEP_X  : distance (in elements) between two consecutive transposed vectors of the same block
    // Both expansions are fully parenthesised so they can be used inside larger expressions.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#define OUTPUT_STEP_X ((K0) * (H0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Loads row "row" of the current block into "vec", but only if that row exists in the source tensor.
    // Uses y, input_ptr and src_stride_y from the enclosing scope.
#define RESHAPE_RHS_T_LOAD_ROW(row, vec)                                                      \
    do                                                                                        \
    {                                                                                         \
        if(y * (uint)K0 + (row) < SRC_HEIGHT)                                                 \
        {                                                                                     \
            vec = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + (row) * src_stride_y));     \
        }                                                                                     \
    } while(0)

    // Builds one transposed vector: component "i" (a single hex digit 0..F) gathered from every loaded row.
#if K0 == 2
#define RESHAPE_RHS_T_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i)
#elif K0 == 3 // K0 == 3
#define RESHAPE_RHS_T_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i, a2.s##i)
#elif K0 == 4 // K0 == 4
#define RESHAPE_RHS_T_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i)
#elif K0 == 8 // K0 == 8
#define RESHAPE_RHS_T_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i)
#elif K0 == 16 // K0 == 16
#define RESHAPE_RHS_T_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i, \
                                                                a8.s##i, a9.s##i, aA.s##i, aB.s##i, aC.s##i, aD.s##i, aE.s##i, aF.s##i)
#else // K0 not supported
#error "Not supported K0 value"
#endif // K0 conditions

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
                                 (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

    // Load values from the RHS matrix. The first row is read unconditionally, the others are bounds-checked.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
    RESHAPE_RHS_T_LOAD_ROW(1, a1);
#if K0 > 2
    RESHAPE_RHS_T_LOAD_ROW(2, a2);
#endif // K0 > 2
#if K0 > 3
    RESHAPE_RHS_T_LOAD_ROW(3, a3);
#endif // K0 > 3
#if K0 > 4
    RESHAPE_RHS_T_LOAD_ROW(4, a4);
    RESHAPE_RHS_T_LOAD_ROW(5, a5);
    RESHAPE_RHS_T_LOAD_ROW(6, a6);
    RESHAPE_RHS_T_LOAD_ROW(7, a7);
#endif // K0 > 4
#if K0 > 8
    RESHAPE_RHS_T_LOAD_ROW(8, a8);
    RESHAPE_RHS_T_LOAD_ROW(9, a9);
    RESHAPE_RHS_T_LOAD_ROW(10, aA);
    RESHAPE_RHS_T_LOAD_ROW(11, aB);
    RESHAPE_RHS_T_LOAD_ROW(12, aC);
    RESHAPE_RHS_T_LOAD_ROW(13, aD);
    RESHAPE_RHS_T_LOAD_ROW(14, aE);
    RESHAPE_RHS_T_LOAD_ROW(15, aF);
#endif // K0 > 8

    // ---------------------------Transpose the block ------------------------------
    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

    // K0xN0 -> N0xK0: res<i> collects component i of every row a0..a(K0-1)
    res0 = RESHAPE_RHS_T_COLUMN(0);
    res1 = RESHAPE_RHS_T_COLUMN(1);
#if N0 > 2
    res2 = RESHAPE_RHS_T_COLUMN(2);
#endif // N0 > 2
#if N0 > 3
    res3 = RESHAPE_RHS_T_COLUMN(3);
#endif // N0 > 3
#if N0 > 4
    res4 = RESHAPE_RHS_T_COLUMN(4);
    res5 = RESHAPE_RHS_T_COLUMN(5);
    res6 = RESHAPE_RHS_T_COLUMN(6);
    res7 = RESHAPE_RHS_T_COLUMN(7);
#endif // N0 > 4
#if N0 > 8
    res8 = RESHAPE_RHS_T_COLUMN(8);
    res9 = RESHAPE_RHS_T_COLUMN(9);
    resA = RESHAPE_RHS_T_COLUMN(A);
    resB = RESHAPE_RHS_T_COLUMN(B);
    resC = RESHAPE_RHS_T_COLUMN(C);
    resD = RESHAPE_RHS_T_COLUMN(D);
    resE = RESHAPE_RHS_T_COLUMN(E);
    resF = RESHAPE_RHS_T_COLUMN(F);
#endif // N0 > 8

    // ---------------------------Store the output values ------------------------------
    // zout0..zoutF are all zero. NOTE(review): presumably STORE_BLOCK adds them as per-row extra offsets - confirm in gemm_helpers.h.
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef RESHAPE_RHS_T_COLUMN
#undef RESHAPE_RHS_T_LOAD_ROW
#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
#endif // defined(TRANSPOSE)
874#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
875
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000876#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +0000877
/** Pastes two preprocessor tokens together (e.g. CONCAT(ARM_DOT, 4) -> ARM_DOT4).
 *
 * Operands of ## are not macro-expanded, so a macro-valued argument (such as K0) has to reach
 * CONCAT through a parameter of an enclosing macro for its value to be pasted.
 */
#define CONCAT(a, b) a##b

/** ARM_DOT<n>(a, b, c): c += a.s0 * b.s0 + ... + a.s<n-1> * b.s<n-1>
 *
 * The products are accumulated one component at a time, in component order, with fma().
 *
 * @param[in]      a First operand: a scalar for ARM_DOT1, otherwise a vector with at least n components
 * @param[in]      b Second operand, same shape as @p a
 * @param[in, out] c Scalar accumulator. Must be an lvalue; it is evaluated several times.
 *
 * @p a and @p b are parenthesised before any component access, so non-primary expressions
 * (e.g. a dereferenced pointer) can be passed safely.
 */
#define ARM_DOT1(a, b, c)       \
    ({                          \
        c = fma((a), (b), c);   \
    })
#define ARM_DOT2(a, b, c)               \
    ({                                  \
        c = fma((a).s0, (b).s0, c);     \
        c = fma((a).s1, (b).s1, c);     \
    })
#define ARM_DOT3(a, b, c)               \
    ({                                  \
        ARM_DOT2(a, b, c);              \
        c = fma((a).s2, (b).s2, c);     \
    })
#define ARM_DOT4(a, b, c)               \
    ({                                  \
        ARM_DOT3(a, b, c);              \
        c = fma((a).s3, (b).s3, c);     \
    })
#define ARM_DOT8(a, b, c)                   \
    ({                                      \
        ARM_DOT4((a).lo, (b).lo, c);        \
        ARM_DOT4((a).hi, (b).hi, c);        \
    })
#define ARM_DOT16(a, b, c)                  \
    ({                                      \
        ARM_DOT8((a).lo, (b).lo, c);        \
        ARM_DOT8((a).hi, (b).hi, c);        \
    })
909
910#if N0 == 2
911#define ARM_DOT_K0XN0(k0, a, b, c) \
912 ({ \
913 CONCAT(ARM_DOT, k0) \
914 ((a), (b##0), (c.s0)); \
915 CONCAT(ARM_DOT, k0) \
916 ((a), (b##1), (c.s1)); \
917 })
918#elif N0 == 3 // N0 == 3
919#define ARM_DOT_K0XN0(k0, a, b, c) \
920 ({ \
921 CONCAT(ARM_DOT, k0) \
922 ((a), (b##0), (c.s0)); \
923 CONCAT(ARM_DOT, k0) \
924 ((a), (b##1), (c.s1)); \
925 CONCAT(ARM_DOT, k0) \
926 ((a), (b##2), (c.s2)); \
927 })
928#elif N0 == 4 // N0 == 4
929#define ARM_DOT_K0XN0(k0, a, b, c) \
930 ({ \
931 CONCAT(ARM_DOT, k0) \
932 ((a), (b##0), (c.s0)); \
933 CONCAT(ARM_DOT, k0) \
934 ((a), (b##1), (c.s1)); \
935 CONCAT(ARM_DOT, k0) \
936 ((a), (b##2), (c.s2)); \
937 CONCAT(ARM_DOT, k0) \
938 ((a), (b##3), (c.s3)); \
939 })
940#elif N0 == 8 // N0 == 8
941#define ARM_DOT_K0XN0(k0, a, b, c) \
942 ({ \
943 CONCAT(ARM_DOT, k0) \
944 ((a), (b##0), (c.s0)); \
945 CONCAT(ARM_DOT, k0) \
946 ((a), (b##1), (c.s1)); \
947 CONCAT(ARM_DOT, k0) \
948 ((a), (b##2), (c.s2)); \
949 CONCAT(ARM_DOT, k0) \
950 ((a), (b##3), (c.s3)); \
951 CONCAT(ARM_DOT, k0) \
952 ((a), (b##4), (c.s4)); \
953 CONCAT(ARM_DOT, k0) \
954 ((a), (b##5), (c.s5)); \
955 CONCAT(ARM_DOT, k0) \
956 ((a), (b##6), (c.s6)); \
957 CONCAT(ARM_DOT, k0) \
958 ((a), (b##7), (c.s7)); \
959 })
960#elif N0 == 16 // N0 == 16
961#define ARM_DOT_K0XN0(k0, a, b, c) \
962 ({ \
963 CONCAT(ARM_DOT, k0) \
964 ((a), (b##0), (c.s0)); \
965 CONCAT(ARM_DOT, k0) \
966 ((a), (b##1), (c.s1)); \
967 CONCAT(ARM_DOT, k0) \
968 ((a), (b##2), (c.s2)); \
969 CONCAT(ARM_DOT, k0) \
970 ((a), (b##3), (c.s3)); \
971 CONCAT(ARM_DOT, k0) \
972 ((a), (b##4), (c.s4)); \
973 CONCAT(ARM_DOT, k0) \
974 ((a), (b##5), (c.s5)); \
975 CONCAT(ARM_DOT, k0) \
976 ((a), (b##6), (c.s6)); \
977 CONCAT(ARM_DOT, k0) \
978 ((a), (b##7), (c.s7)); \
979 CONCAT(ARM_DOT, k0) \
980 ((a), (b##8), (c.s8)); \
981 CONCAT(ARM_DOT, k0) \
982 ((a), (b##9), (c.s9)); \
983 CONCAT(ARM_DOT, k0) \
984 ((a), (b##A), (c.sA)); \
985 CONCAT(ARM_DOT, k0) \
986 ((a), (b##B), (c.sB)); \
987 CONCAT(ARM_DOT, k0) \
988 ((a), (b##C), (c.sC)); \
989 CONCAT(ARM_DOT, k0) \
990 ((a), (b##D), (c.sD)); \
991 CONCAT(ARM_DOT, k0) \
992 ((a), (b##E), (c.sE)); \
993 CONCAT(ARM_DOT, k0) \
994 ((a), (b##F), (c.sF)); \
995 })
996#else // N0 not supported
997#error "N0 value not supported"
998#endif // N0 conditions
999
1000/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1001 * The LHS matrix is NOT reshaped
1002 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
1003 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001004 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
1006 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
1007 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
1008 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
1009 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01001011 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1012 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001013 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1014 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1015 * - N0 = 2, 3, 4, 8, 16
1016 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001017 * - H0 >= 1
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001018 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01001019 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001020 * The activation function is performed after the bias addition
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001021 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1022 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1023 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1024 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1025 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1026 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1027 *
Sheri Zhang1a378102020-04-30 12:59:39 +01001028 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
1029 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001031 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001033 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001034 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1035 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
1037 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
1039 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001040 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
1041 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
1042 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
1043 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
1044 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
1045 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001046 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1047 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1048 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1049 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1050 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1051 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Sheri Zhang1a378102020-04-30 12:59:39 +01001052 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001053 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001054 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001055 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1056 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1057 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001058 */
1059__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
1060 IMAGE_DECLARATION(rhs),
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001061#if defined(BETA)
1062 IMAGE_DECLARATION(bias),
1063#endif // defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001064 IMAGE_DECLARATION(dst),
1065 uint lhs_stride_z,
1066 uint rhs_stride_z,
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001067#if defined(BETA)
1068 uint bias_stride_z,
1069#endif //defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001070 uint dst_stride_z
1071#if defined(REINTERPRET_INPUT_AS_3D)
1072 ,
1073 uint lhs_cross_plane_pad
1074#endif // REINTERPRET_INPUT_AS_3D
1075#if defined(REINTERPRET_OUTPUT_AS_3D)
1076 ,
1077 uint dst_cross_plane_pad
1078#endif // REINTERPRET_OUTPUT_AS_3D
1079 )
1080{
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001081 // Block size
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001082#define RHS_BLOCK_SIZE ((K0) * (N0))
1083
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001084 // RHS offset and step X
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001085#if defined(RHS_INTERLEAVE)
1086#define RHS_OFFSET_X (K0)
1087#define RHS_STEP_X ((K0) * (H0))
1088#define RHS_STEP_LOOP (1)
1089#else // defined(RHS_INTERLEAVE)
1090#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1091#define RHS_STEP_X (K0)
1092#define RHS_STEP_LOOP (H0)
1093#endif // defined(RHS_INTERLEAVE)
1094
1095 uint x = get_global_id(0);
1096 uint y = get_global_id(1);
1097 uint z = get_global_id(2);
1098
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001099#if defined(DUMMY_WORK_ITEMS)
1100 if((x * N0 >= N) || (y * M0 >= M))
1101 {
1102 return;
1103 }
1104#endif // defined(DUMMY_WORK_ITEMS)
1105
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001106 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01001107 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001108
Sheri Zhang1a378102020-04-30 12:59:39 +01001109 // Compute RHS reshaped matrix address
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001110 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
1111
1112#if defined(MATRIX_B_DEPTH)
1113 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1114 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1115#else // defined(MATRIX_B_DEPTH)
1116 rhs_offset += z * rhs_stride_z;
1117#endif // defined(MATRIX_B_DEPTH)
1118
Usama Arif0681e3b2019-04-25 14:28:07 +01001119 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001120 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001121
1122#if defined(REINTERPRET_INPUT_AS_3D)
Usama Arif0681e3b2019-04-25 14:28:07 +01001123 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
1124 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001125
1126 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1127 // multiply lhs_stride_z by DEPTH_GEMM3D
1128 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1129
1130#else // defined(REINTERPRET_INPUT_AS_3D)
1131
1132 // Add offset for batched GEMM
1133 lhs_offset += z * lhs_stride_z;
1134
1135#endif // defined(REINTERPRET_INPUT_AS_3D)
1136
1137 // Initialize the accumulators
1138 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
1139
1140 int i = 0;
1141 for(; i <= (K - K0); i += K0)
1142 {
1143 // Supported cases (M0, K0):
1144 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
1145 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
1146 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
1147 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
1148 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
1149 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
1150 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
1151 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
1152 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001153 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001154
Sheri Zhang1a378102020-04-30 12:59:39 +01001155 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001156 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001157
1158 // Accumulate
1159 ARM_DOT_K0XN0(K0, a0, b, c0);
1160#if M0 > 1
1161 ARM_DOT_K0XN0(K0, a1, b, c1);
1162#endif // M0 > 1
1163#if M0 > 2
1164 ARM_DOT_K0XN0(K0, a2, b, c2);
1165#endif // M0 > 2
1166#if M0 > 3
1167 ARM_DOT_K0XN0(K0, a3, b, c3);
1168#endif // M0 > 3
1169#if M0 > 4
1170 ARM_DOT_K0XN0(K0, a4, b, c4);
1171#endif // M0 > 4
1172#if M0 > 5
1173 ARM_DOT_K0XN0(K0, a5, b, c5);
1174#endif // M0 > 5
1175#if M0 > 6
1176 ARM_DOT_K0XN0(K0, a6, b, c6);
1177#endif // M0 > 6
1178#if M0 > 7
1179 ARM_DOT_K0XN0(K0, a7, b, c7);
1180#endif // M0 > 7
1181
1182 lhs_offset += K0 * sizeof(DATA_TYPE);
1183 rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
1184 }
1185
1186 // Left-over accumulations
1187 for(; i < K; ++i)
1188 {
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001189 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001190 LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001191
Sheri Zhang1a378102020-04-30 12:59:39 +01001192 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001193 LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001194
1195 // Accumulate
1196 ARM_DOT_K0XN0(1, a0, b, c0);
1197#if M0 > 1
1198 ARM_DOT_K0XN0(1, a1, b, c1);
1199#endif // M0 > 1
1200#if M0 > 2
1201 ARM_DOT_K0XN0(1, a2, b, c2);
1202#endif // M0 > 2
1203#if M0 > 3
1204 ARM_DOT_K0XN0(1, a3, b, c3);
1205#endif // M0 > 3
1206#if M0 > 4
1207 ARM_DOT_K0XN0(1, a4, b, c4);
1208#endif // M0 > 4
1209#if M0 > 5
1210 ARM_DOT_K0XN0(1, a5, b, c5);
1211#endif // M0 > 5
1212#if M0 > 6
1213 ARM_DOT_K0XN0(1, a6, b, c6);
1214#endif // M0 > 6
1215#if M0 > 7
1216 ARM_DOT_K0XN0(1, a7, b, c7);
1217#endif // M0 > 7
1218
1219 lhs_offset += sizeof(DATA_TYPE);
1220 rhs_offset += sizeof(DATA_TYPE);
1221 }
1222
SiCong Li406a13f2020-07-15 12:09:58 +01001223 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001224
1225 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1226
1227#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001228
1229 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +01001230 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001231
1232 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1233 // multiply dst_stride_z by DEPTH_GEMM3D
1234 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1235
1236#else // defined(REINTERPRET_OUTPUT_AS_3D)
1237
1238 // Add offset for batched GEMM
1239 dst_addr += z * dst_stride_z;
1240
1241#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1242
1243 // Multiply by the weight of matrix-matrix product and store the result
1244#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01001245 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001246#endif // defined(ALPHA)
1247
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001248 // Add beta*bias
1249#if defined(BETA)
1250#if defined(BROADCAST_BIAS)
1251 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
1252
1253 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1254
1255#ifndef UNIT_BETA
1256 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
1257#endif // UNIT_BIAS
1258
1259 // c = c + bias[broadcasted]
1260 ADD_BLOCK_BROADCAST(M0, c, bias0);
1261
1262#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01001263 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001264
1265 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1266
1267#ifndef UNIT_BETA
1268 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
1269#endif // UNIT_BIAS
1270
1271 // c = c + bias
1272 ADD_BLOCK(M0, c, bias);
1273
1274#endif // defined(BROADCAST_BIAS)
1275#endif // defined(BETA)
1276
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001277#if defined(ACTIVATION_TYPE)
1278 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
1279#endif // defined(ACTIVATION_TYPE)
1280
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001281 // Store output block
SiCong Li406a13f2020-07-15 12:09:58 +01001282 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001283
1284#undef RHS_BLOCK_SIZE
1285#undef RHS_OFFSET_X
1286#undef RHS_STEP_X
1287}
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001288
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * Each work-item accumulates one M0xN0 block of the destination: it walks the K dimension in steps of K0,
 * handles the K % K0 remainder with zero-padded LHS vectors, then applies the optional alpha scaling,
 * bias addition and activation before storing the block.
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note If -DMATRIX_B_DEPTH is defined, the RHS batch index is get_global_id(2) % MATRIX_B_DEPTH instead of get_global_id(2).
 * @note If -DALPHA is defined, the accumulators are scaled by ALPHA. If -DBETA is defined, the bias is added after being scaled by BETA
 *       (the scaling is skipped when -DUNIT_BETA is defined); with -DBROADCAST_BIAS a single bias row is added to every row of the block.
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t rhs_img,
#if defined(BETA)
                                                  IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                  IMAGE_DECLARATION(dst),
                                                  uint lhs_stride_z,
                                                  uint rhs_stride_z,
#if defined(BETA)
                                                  uint bias_stride_z,
#endif // defined(BETA)
                                                  uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                  ,
                                                  uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
                                                  uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Number of K elements left once K has been consumed in steps of K0
#define LEFTOVER_K (K % K0)

    // Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items that fall completely outside the MxN output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates (the batch index is folded into the image row, see RHS_HEIGHT)
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: consume K in full steps of K0
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

#if LEFTOVER_K != 0
    // Note: We cannot read out-of-bound elements from the RHS matrix because
    // the RHS width is always multiple of K0. This is not true for the LHS matrix,
    // so the last LEFTOVER_K LHS elements of each row are read one at a time into
    // zero-initialized vectors before a full K0-wide accumulation is performed.

    // Gives element-wise (s) and vector (v) access to the same K0 values
    union UNION_VEC_TYPE
    {
        DATA_TYPE s[K0];
        VEC_DATA_TYPE(DATA_TYPE, K0)
        v;
    };

    union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
    union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
    union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
    union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
    union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
    union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
    union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
    union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7

    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

    // Load from RHS matrix
    LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

    // Load from LHS matrix, one element per row and per iteration
    for(int k = 0; k < LEFTOVER_K; ++k)
    {
        a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
        a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
        a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
        a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
        a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
        a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
        a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
        a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7

        lhs_offset += sizeof(DATA_TYPE);
    }

    // Accumulate (LHS lanes beyond LEFTOVER_K are still zero from the initialization above)
    ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
    ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
    ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
    ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
    ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
    ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
    ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
    ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7

#endif // LEFTOVER_K != 0

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0, zout1=0, ... zout(M0-1)=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

    // Release every macro defined locally by this kernel so none of them leaks into the code that follows
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef LEFTOVER_K
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
1634
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001635#define VFMA(a, b, c) \
1636 ({ \
1637 c = fma(a, b, c); \
1638 })
1639
1640#if M0 == 1
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001641#define VFMA_M0xN0(i, a, b, c) \
1642 ({ \
1643 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001644 })
1645#elif M0 == 2 // M0 == 2
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001646#define VFMA_M0xN0(i, a, b, c) \
1647 ({ \
1648 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1649 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001650 })
1651#elif M0 == 3 // M0 == 3
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001652#define VFMA_M0xN0(i, a, b, c) \
1653 ({ \
1654 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1655 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1656 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001657 })
1658#elif M0 == 4 // M0 == 4
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001659#define VFMA_M0xN0(i, a, b, c) \
1660 ({ \
1661 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1662 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1663 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1664 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001665 })
1666#elif M0 == 5 // M0 == 5
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001667#define VFMA_M0xN0(i, a, b, c) \
1668 ({ \
1669 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1670 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1671 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1672 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1673 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001674 })
1675#elif M0 == 6 // M0 == 6
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001676#define VFMA_M0xN0(i, a, b, c) \
1677 ({ \
1678 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1679 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1680 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1681 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1682 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1683 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001684 })
1685#elif M0 == 7 // M0 == 7
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001686#define VFMA_M0xN0(i, a, b, c) \
1687 ({ \
1688 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1689 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1690 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1691 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1692 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1693 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
1694 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001695 })
1696#elif M0 == 8 // M0 == 8
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001697#define VFMA_M0xN0(i, a, b, c) \
1698 ({ \
1699 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1700 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1701 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1702 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1703 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1704 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
1705 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
1706 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001707 })
1708#else // M0 not supported
1709#error "M0 not supported"
1710#endif // M0 not supported
1711
1712/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1713 * The LHS matrix is NOT reshaped
1714 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
1715 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001716 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
1718 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
1719 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
1720 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01001722 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1723 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001724 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1725 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1726 * - N0 = 2, 3, 4, 8, 16
1727 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001728 * - H0 >= 1
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001729 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01001730 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001731 * The activation function is performed after the bias addition
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001732 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1733 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1734 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1735 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1736 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1737 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1738 *
Sheri Zhang1a378102020-04-30 12:59:39 +01001739 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
1740 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001741 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001742 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001743 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001744 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001745 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1746 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1747 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1748 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1749 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1750 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001751 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
1752 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001753 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001754 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001755 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
1756 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
1757 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1758 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1759 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1760 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1761 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1762 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Sheri Zhang1a378102020-04-30 12:59:39 +01001763 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001764 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001765 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001766 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1767 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1768 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001769 */
1770__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
1771 IMAGE_DECLARATION(rhs),
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001772#if defined(BETA)
1773 IMAGE_DECLARATION(bias),
1774#endif // defined(BETA)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001775 IMAGE_DECLARATION(dst),
1776 uint lhs_stride_z,
1777 uint rhs_stride_z,
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001778#if defined(BETA)
1779 uint bias_stride_z,
1780#endif //defined(BETA)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001781 uint dst_stride_z
1782#if defined(REINTERPRET_INPUT_AS_3D)
1783 ,
1784 uint lhs_cross_plane_pad
1785#endif // REINTERPRET_INPUT_AS_3D
1786#if defined(REINTERPRET_OUTPUT_AS_3D)
1787 ,
1788 uint dst_cross_plane_pad
1789#endif // REINTERPRET_OUTPUT_AS_3D
1790 )
1791{
1792 // Block size
1793#define RHS_BLOCK_SIZE ((K0) * (N0))
1794
1795 // RHS offset and step X
1796#if defined(RHS_INTERLEAVE)
1797#define RHS_OFFSET_X (N0)
1798#define RHS_STEP_X ((N0) * (H0))
1799#define RHS_STEP_LOOP (1)
1800#else // defined(RHS_INTERLEAVE)
1801#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1802#define RHS_STEP_X (N0)
1803#define RHS_STEP_LOOP (H0)
1804#endif // defined(RHS_INTERLEAVE)
1805
1806 uint x = get_global_id(0);
1807 uint y = get_global_id(1);
1808 uint z = get_global_id(2);
1809
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001810#if defined(DUMMY_WORK_ITEMS)
1811 if((x * N0 >= N) || (y * M0 >= M))
1812 {
1813 return;
1814 }
1815#endif // defined(DUMMY_WORK_ITEMS)
1816
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001817 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01001818 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001819
Sheri Zhang1a378102020-04-30 12:59:39 +01001820 // Compute RHS reshaped matrix address
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001821 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
1822
1823#if defined(MATRIX_B_DEPTH)
1824 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1825 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1826#else // defined(MATRIX_B_DEPTH)
1827 rhs_offset += z * rhs_stride_z;
1828#endif // defined(MATRIX_B_DEPTH)
1829
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001830 REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
1831 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001832
1833#if defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001834
1835 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +01001836 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001837
1838 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1839 // multiply lhs_stride_z by DEPTH_GEMM3D
1840 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1841
1842#else // defined(REINTERPRET_INPUT_AS_3D)
1843
1844 // Add offset for batched GEMM
1845 lhs_offset += z * lhs_stride_z;
1846
1847#endif // defined(REINTERPRET_INPUT_AS_3D)
1848
1849 // Initialize the accumulators
1850 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
1851
1852 int i = 0;
1853 for(; i <= (K - K0); i += K0)
1854 {
1855 // Supported cases (M0, K0):
1856 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
1857 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
1858 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
1859 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
1860 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
1861 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
1862 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
1863 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
1864 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001865 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001866
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001867 VEC_DATA_TYPE(DATA_TYPE, N0)
1868 b0;
1869
1870 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
1871 VFMA_M0xN0(0, a, b0, c);
1872 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
1873 VFMA_M0xN0(1, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001874#if K0 > 2
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001875 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
1876 VFMA_M0xN0(2, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001877#endif // K0 > 2
1878#if K0 > 3
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001879 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
1880 VFMA_M0xN0(3, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001881#endif // K0 > 3
1882#if K0 > 4
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001883 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
1884 VFMA_M0xN0(4, a, b0, c);
1885 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
1886 VFMA_M0xN0(5, a, b0, c);
1887 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
1888 VFMA_M0xN0(6, a, b0, c);
1889 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
1890 VFMA_M0xN0(7, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001891#endif // K0 > 4
1892#if K0 > 8
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001893 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
1894 VFMA_M0xN0(8, a, b0, c);
1895 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
1896 VFMA_M0xN0(9, a, b0, c);
1897 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
1898 VFMA_M0xN0(A, a, b0, c);
1899 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
1900 VFMA_M0xN0(B, a, b0, c);
1901 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
1902 VFMA_M0xN0(C, a, b0, c);
1903 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
1904 VFMA_M0xN0(D, a, b0, c);
1905 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
1906 VFMA_M0xN0(E, a, b0, c);
1907 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
1908 VFMA_M0xN0(F, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001909#endif // K0 > 8
1910
1911 lhs_offset += K0 * sizeof(DATA_TYPE);
1912 rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
1913 }
1914
1915 // Left-over accumulations
1916 for(; i < K; ++i)
1917 {
1918 // Load values from LHS matrix
1919 VEC_DATA_TYPE(DATA_TYPE, 2)
1920 a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
1921#if M0 > 1
1922 VEC_DATA_TYPE(DATA_TYPE, 2)
1923 a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
1924#endif // M0 > 1
1925#if M0 > 2
1926 VEC_DATA_TYPE(DATA_TYPE, 2)
1927 a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
1928#endif // M0 > 2
1929#if M0 > 3
1930 VEC_DATA_TYPE(DATA_TYPE, 2)
1931 a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
1932#endif // M0 > 3
1933#if M0 > 4
1934 VEC_DATA_TYPE(DATA_TYPE, 2)
1935 a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
1936#endif // M0 > 4
1937#if M0 > 5
1938 VEC_DATA_TYPE(DATA_TYPE, 2)
1939 a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
1940#endif // M0 > 5
1941#if M0 > 6
1942 VEC_DATA_TYPE(DATA_TYPE, 2)
1943 a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
1944#endif // M0 > 6
1945#if M0 > 7
1946 VEC_DATA_TYPE(DATA_TYPE, 2)
giuros01b3204e72019-04-01 13:50:22 +01001947 a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001948#endif // M0 > 7
1949
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001950 VEC_DATA_TYPE(DATA_TYPE, N0)
1951 b0;
1952
1953 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
1954 VFMA_M0xN0(0, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001955
1956 lhs_offset += sizeof(DATA_TYPE);
1957 rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
1958 }
1959
SiCong Li406a13f2020-07-15 12:09:58 +01001960 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001961
1962 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1963
1964#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001965 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +01001966 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001967
1968 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1969 // multiply dst_stride_z by DEPTH_GEMM3D
1970 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1971
1972#else // defined(REINTERPRET_OUTPUT_AS_3D)
1973
1974 // Add offset for batched GEMM
1975 dst_addr += z * dst_stride_z;
1976
1977#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1978
1979 // Multiply by the weight of matrix-matrix product and store the result
1980#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01001981 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001982#endif // defined(ALPHA)
1983
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001984 // Add beta*bias
1985#if defined(BETA)
1986#if defined(BROADCAST_BIAS)
1987 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
1988
1989 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1990
1991#ifndef UNIT_BETA
1992 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
1993#endif // UNIT_BIAS
1994
1995 // c = c + bias[broadcasted]
1996 ADD_BLOCK_BROADCAST(M0, c, bias0);
1997
1998#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01001999 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01002000
2001 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2002
2003#ifndef UNIT_BETA
2004 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
2005#endif // UNIT_BIAS
2006
2007 // c = c + bias
2008 ADD_BLOCK(M0, c, bias);
2009
2010#endif // defined(BROADCAST_BIAS)
2011#endif // defined(BETA)
2012
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002013#if defined(ACTIVATION_TYPE)
2014 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
2015#endif // defined(ACTIVATION_TYPE)
2016
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00002017 // Store output block
SiCong Li406a13f2020-07-15 12:09:58 +01002018 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00002019
2020#undef RHS_BLOCK_SIZE
2021#undef RHS_OFFSET_X
2022#undef RHS_STEP_X
2023}
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002024
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes). Not read by this kernel: the RHS batch offset is derived from RHS_HEIGHT
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                   __read_only image2d_t rhs_img,
#if defined(BETA)
                                                   IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                   IMAGE_DECLARATION(dst),
                                                   uint lhs_stride_z,
                                                   uint rhs_stride_z,
#if defined(BETA)
                                                   uint bias_stride_z,
#endif //defined(BETA)
                                                   uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                   ,
                                                   uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                   ,
                                                   uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                  )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Block size (in pixels) of one K0xN0 block of the reshaped RHS matrix
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS offset and step X (in pixels).
    // RHS_STEP_LOOP is defined here so that this kernel no longer depends on the definition left behind
    // by gemm_mm_reshaped_only_rhs_nt. The replacement lists are token-identical to the ones used there,
    // so the redefinition is benign.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    // Work-item coordinates: x indexes the N0-wide output column block, y the M0-high output row block,
    // z the batch
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items dispatched beyond the M x N output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address (byte offset of the first row handled by this work-item)
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = z;
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates (image coordinates, x in pixels).
    // The batch is folded into the image rows: batch z_rhs starts at row z_rhs * RHS_HEIGHT
    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   //uint zin0=0,zin1=0,zin2=0,... zin7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero15=0;

#if defined(REINTERPRET_INPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

    // Main loop: consume K0 values of the accumulation dimension per iteration
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        // One RHS row (N0 values) is read from the image at a time and reused for all M0 LHS rows
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(0, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(4, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(5, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(6, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(8, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(9, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(A, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(B, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(C, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(D, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(E, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Left-over accumulations: one value of the accumulation dimension per iteration
    for(; i < K; ++i)
    {
        // Load values from LHS matrix.
        // A single scalar is read per row and assigned to a 2-element vector (the scalar is replicated
        // in every lane), so that the same VFMA_M0xN0(0, ...) helper used by the main loop can be applied.
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        x_rhs += RHS_STEP_X;
    }

    // Address of the first element of the output block written by this work-item
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every one of the M0 output rows
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    // The bias has one row per output row: load the M0xN0 block matching the output block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

    // NOTE: PIXEL_UNIT and RHS_STEP_LOOP are left defined, exactly as they were on exit from this kernel
    // before; code later in this file (not reviewed here) may depend on that.
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002331#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00002332
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002333#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002334
/** ARM_DOT_K0(a, b, c): accumulate the K0-element dot product of @p a and @p b into @p c.
 *
 * Computes c += a.s0 * b.s0 + a.s1 * b.s1 + ... over the first K0 components, one component at a time
 * and in increasing component order.
 *
 * - With -DMIXED_PRECISION each step is written as a separate multiply followed by "+=".
 * - Otherwise each step is written as c = fma(a.sX, b.sX, c).
 *
 * @note @p a, @p b and @p c are expanded once per component, so they must be free of side effects.
 * @note @p c must be a modifiable lvalue.
 * @note Only K0 = 2, 3, 4, 8, 16 are supported; any other value stops the build with an #error.
 */
#if defined(MIXED_PRECISION)
#if K0 == 2
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
        c += a.s2 * b.s2;   \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
        c += a.s2 * b.s2;   \
        c += a.s3 * b.s3;   \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
        c += a.s2 * b.s2;   \
        c += a.s3 * b.s3;   \
        c += a.s4 * b.s4;   \
        c += a.s5 * b.s5;   \
        c += a.s6 * b.s6;   \
        c += a.s7 * b.s7;   \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
        c += a.s2 * b.s2;   \
        c += a.s3 * b.s3;   \
        c += a.s4 * b.s4;   \
        c += a.s5 * b.s5;   \
        c += a.s6 * b.s6;   \
        c += a.s7 * b.s7;   \
        c += a.s8 * b.s8;   \
        c += a.s9 * b.s9;   \
        c += a.sA * b.sA;   \
        c += a.sB * b.sB;   \
        c += a.sC * b.sC;   \
        c += a.sD * b.sD;   \
        c += a.sE * b.sE;   \
        c += a.sF * b.sF;   \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions
#else  // defined(MIXED_PRECISION)
#if K0 == 2
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
        c = fma(a.s2, b.s2, c); \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
        c = fma(a.s2, b.s2, c); \
        c = fma(a.s3, b.s3, c); \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
        c = fma(a.s2, b.s2, c); \
        c = fma(a.s3, b.s3, c); \
        c = fma(a.s4, b.s4, c); \
        c = fma(a.s5, b.s5, c); \
        c = fma(a.s6, b.s6, c); \
        c = fma(a.s7, b.s7, c); \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
        c = fma(a.s2, b.s2, c); \
        c = fma(a.s3, b.s3, c); \
        c = fma(a.s4, b.s4, c); \
        c = fma(a.s5, b.s5, c); \
        c = fma(a.s6, b.s6, c); \
        c = fma(a.s7, b.s7, c); \
        c = fma(a.s8, b.s8, c); \
        c = fma(a.s9, b.s9, c); \
        c = fma(a.sA, b.sA, c); \
        c = fma(a.sB, b.sB, c); \
        c = fma(a.sC, b.sC, c); \
        c = fma(a.sD, b.sD, c); \
        c = fma(a.sE, b.sE, c); \
        c = fma(a.sF, b.sF, c); \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions
#endif // defined(MIXED_PRECISION)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002450
/** ARM_DOT_K0XN0(a, b, c): accumulate N0 dot products into the N0 components of @p c.
 *
 * For every n in [0, N0) this expands to ARM_DOT_K0((a), (b<n>), (c.s<n>)), where b<n> is the identifier
 * obtained by pasting the hexadecimal digit n (0-9, A-F) onto @p b, and c.s<n> is component n of @p c.
 *
 * @note @p b must therefore be a bare identifier prefix (e.g. passing b selects b0, b1, ...), not an expression.
 * @note @p a is shared by all N0 dot products and is expanded once per product.
 * @note Only N0 = 2, 3, 4, 8, 16 are supported; any other value stops the build with an #error.
 */
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
        ARM_DOT_K0((a), (b##8), (c.s8)); \
        ARM_DOT_K0((a), (b##9), (c.s9)); \
        ARM_DOT_K0((a), (b##A), (c.sA)); \
        ARM_DOT_K0((a), (b##B), (c.sB)); \
        ARM_DOT_K0((a), (b##C), (c.sC)); \
        ARM_DOT_K0((a), (b##D), (c.sD)); \
        ARM_DOT_K0((a), (b##E), (c.sE)); \
        ARM_DOT_K0((a), (b##F), (c.sF)); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions
2507
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
 *       In this kernel body M and N appear only in the -DDUMMY_WORK_ITEMS early-exit check; the accumulation loop is bounded by the runtime argument @p k.
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
                                            IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                            IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                            IMAGE_DECLARATION(dst),
                                            uint k,
                                            uint lhs_stride_z,
                                            uint rhs_stride_z,
#if defined(BETA)
                                            uint bias_stride_z,
#endif //defined(BETA)
                                            uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                            ,
                                            uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                           )
{
    // Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X ((K0) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (K0)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose first output element lies outside the N x M output exit early
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

    // Compute RHS matrix address
    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_addr += get_global_id(2) * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    // The trip count comes from the runtime argument k; each iteration consumes K0 values along K
    for(int i = 0; i < k; i += K0)
    {
        // Supported cases (M0, K0):
        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

        // Load values from RHS matrix
        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

        // Accumulate
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif // M0 > 7

        // Advance both operands to the next K0-wide block
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += get_global_id(2) * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#endif // defined(MIXED_PRECISION)

    // Drop the kernel-local helper macros so later kernels in this file can redefine them
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
giuros01b3204e72019-04-01 13:50:22 +01002774
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
 *       In this kernel body M and N appear only in the -DDUMMY_WORK_ITEMS early-exit check; the accumulation loop is bounded by the runtime argument @p k,
 *       consistently with gemm_mm_reshaped_lhs_nt_rhs_t, so the body itself does not read K.
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                    IMAGE_DECLARATION(dst),
                                                    uint k,
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif //defined(BETA)
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                   )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose first output element lies outside the N x M output exit early
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    // The batch index is folded into the image row through RHS_HEIGHT
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    // Bound the loop with the runtime argument k (as gemm_mm_reshaped_lhs_nt_rhs_t does)
    // instead of the compile-time macro K, so the declared argument is actually honoured
    for(int i = 0; i < k; i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif // M0 > 7

        // Advance the LHS pointer (bytes) and the RHS image x coordinate (pixel units)
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += get_global_id(2) * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#endif // defined(MIXED_PRECISION)

    // Drop the kernel-local helper macros so later kernels in this file can redefine them
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
3040
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003041#if defined(LHS_TRANSPOSE)
3042
// Shorthand that forwards to VEC_DATA_TYPE(TYPE, SIZE); used by the ARM_VVM_T_NT_* macros below
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
3044
/** Multiply-accumulate primitive: c = a * b + c.
 *
 * Four variants are selected at preprocessing time:
 *  - With -DMIXED_PRECISION both operands are first converted to
 *    VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0); otherwise they are used as they are.
 *  - When GPU_ARCH == GPU_ARCH_MIDGARD the result is written as "c += a * b",
 *    otherwise the fma() built-in is called.
 *
 * @note If neither GPU_ARCH nor GPU_ARCH_MIDGARD is defined, the preprocessor evaluates both
 *       identifiers as 0, so the comparison holds and the "c += a * b" form is selected.
 * @note Every variant already ends with a ';'.
 *
 * @param[in]      N0 Vector width; only used by the MIXED_PRECISION variants to build the conversion type
 * @param[in]      a  First factor
 * @param[in]      b  Second factor
 * @param[in, out] c  Accumulator, updated in place
 */
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)
/** Column-vector by row-vector update for a fixed number of rows (1, 2, 3, 4 or 8), K0 = 1.
 *
 * For every row i the i-th component of @p a (the whole of @p a in the 1-row case) is
 * expanded with (VTYPE(TYPE, N0))(...) and handed to ARM_VFMA together with @p b and the
 * accumulator C##i. The 3-, 4- and 8-row variants reuse the next smaller variant and append
 * the remaining rows.
 *
 * @param[in]      N0   Width of the row vector @p b and of each accumulator
 * @param[in]      TYPE Element type used to build VTYPE(TYPE, N0)
 * @param[in]      a    Column vector; components .s0 ... .s7 are read as needed (used directly in the 1-row case)
 * @param[in]      b    Row vector
 * @param[in, out] C    Basename of the accumulators; C##0 ... C##7 must be in scope as needed
 */
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })
3090
// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
// M0 is an operand of ##, so it is pasted as spelled (no macro expansion at this level) and
// selects one of ARM_VVM_T_NT_{1,2,3,4,8}xN0x1; callers must therefore pass an already-expanded value
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
3098
3099#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \
3100 ({ \
3101 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
3102 })
3103#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \
3104 ({ \
3105 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \
3106 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
3107 })
3108#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \
3109 ({ \
3110 ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \
3111 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
3112 })
3113#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \
3114 ({ \
3115 ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \
3116 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
3117 })
3118#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \
3119 ({ \
3120 ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \
3121 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
3122 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
3123 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
3124 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
3125 })
3126#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \
3127 ({ \
3128 ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \
3129 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
3130 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
3131 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
3132 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
3133 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
3134 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
3135 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
3136 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
3137 })
3138
3139// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
3140// The dimensions for this matrix multiplications are defined through M0, N0 and K0
3141// The dimensions supported are:
3142// M0: 1, 2, 3, 4, 8
3143// N0: 1, 2, 3, 4, 8, 16
3144// K0: 1, 2, 3, 4, 8, 16
3145// This macro calls the vector-by-matrix macro K0 times
3146// A, B and C are matrices
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003147#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
3148 CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003149 (M0, N0, TYPE, A, B, C)
3150
3151/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
3152 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3153 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3154 *
3155 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
3156 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003157 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003158 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3159 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3160 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3161 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3162 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
3163 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3164 * - M0 = 2, 3, 4, 8
3165 * - N0 = 2, 3, 4, 8, 16
3166 * - K0 = 2, 3, 4, 8, 16
3167 * - V0 >= 1
3168 * - H0 >= 1
3169 *
3170 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3171 * The activation function is performed after the bias addition
3172 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3173 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3174 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3175 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3176 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3177 *
3178 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
3179 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3180 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3181 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3182 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3183 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3184 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
3185 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
3186 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3187 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
3188 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3189 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
3190 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3191 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3192 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3193 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3194 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3195 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3196 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3197 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3198 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3199 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3200 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3201 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003202 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003203 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3204 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3205 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3206 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3207 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3208 */
3209__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
3210 IMAGE_DECLARATION(rhs),
3211#if defined(BETA)
3212 IMAGE_DECLARATION(bias),
3213#endif // defined(BETA)
3214 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003215 uint k,
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003216 uint lhs_stride_z,
3217 uint rhs_stride_z,
3218#if defined(BETA)
3219 uint bias_stride_z,
3220#endif //defined(BETA)
3221 uint dst_stride_z
3222#if defined(REINTERPRET_OUTPUT_AS_3D)
3223 ,
3224 uint dst_cross_plane_pad
3225#endif // REINTERPRET_OUTPUT_AS_3D
3226 )
3227{
3228 // Block size
3229#define LHS_BLOCK_SIZE ((K0) * (M0))
3230
3231#if defined(LHS_INTERLEAVE)
3232#define LHS_OFFSET_X (M0)
3233#define LHS_STEP_X ((M0) * (V0))
3234#define LHS_STEP_LOOP (1)
3235#else // defined(INTERLEAVE)
3236#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3237#define LHS_STEP_X (M0)
3238#define LHS_STEP_LOOP (V0)
3239#endif // defined(INTERLEAVE)
3240
3241 // Block size
3242#define RHS_BLOCK_SIZE ((K0) * (N0))
3243
3244 // RHS offset and step X
3245#if defined(RHS_INTERLEAVE)
3246#define RHS_OFFSET_X (N0)
3247#define RHS_STEP_X ((N0) * (H0))
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003248#else // defined(RHS_INTERLEAVE)
3249#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3250#define RHS_STEP_X (N0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003251#endif // defined(RHS_INTERLEAVE)
3252
3253 const uint x = get_global_id(0);
3254 const uint y = get_global_id(1);
3255 const uint z = get_global_id(2);
3256
3257#if defined(DUMMY_WORK_ITEMS)
3258 if((x * N0 >= N) || (y * M0 >= M))
3259 {
3260 return;
3261 }
3262#endif // defined(DUMMY_WORK_ITEMS)
3263
3264 // Compute LHS matrix address
3265 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3266
3267 // Compute RHS matrix address
3268 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
3269
3270#if defined(MATRIX_B_DEPTH)
3271 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3272 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
3273#else // defined(MATRIX_B_DEPTH)
3274 rhs_addr += z * rhs_stride_z;
3275#endif // defined(MATRIX_B_DEPTH)
3276
3277 // Initialize the accumulators
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003278 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003279
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003280 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3281
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003282 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3283 __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
3284
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003285 for(int i = 0; i < k; i += K0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003286 {
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003287 VEC_DATA_TYPE(DATA_TYPE, M0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003288 a0;
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003289 VEC_DATA_TYPE(DATA_TYPE, N0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003290 b0;
3291
3292 a0 = VLOAD(M0)(0, lhs);
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003293 b0 = VLOAD(N0)(0, rhs);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003294
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003295 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003296
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003297 lhs += LHS_STEP_X;
3298 rhs += RHS_STEP_X;
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003299
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003300#if K0 > 1
3301 a0 = VLOAD(M0)(0, lhs);
3302 b0 = VLOAD(N0)(0, rhs);
3303
3304 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3305
3306 lhs += LHS_STEP_X;
3307 rhs += RHS_STEP_X;
3308#endif // K0 > 1
3309
3310#if K0 > 2
3311 a0 = VLOAD(M0)(0, lhs);
3312 b0 = VLOAD(N0)(0, rhs);
3313
3314 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3315
3316 lhs += LHS_STEP_X;
3317 rhs += RHS_STEP_X;
3318#endif // K0 > 2
3319
3320#if K0 > 3
3321 a0 = VLOAD(M0)(0, lhs);
3322 b0 = VLOAD(N0)(0, rhs);
3323
3324 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3325
3326 lhs += LHS_STEP_X;
3327 rhs += RHS_STEP_X;
3328#endif // K0 > 3
3329
3330#if K0 > 4
3331 a0 = VLOAD(M0)(0, lhs);
3332 b0 = VLOAD(N0)(0, rhs);
3333
3334 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3335
3336 lhs += LHS_STEP_X;
3337 rhs += RHS_STEP_X;
3338
3339 a0 = VLOAD(M0)(0, lhs);
3340 b0 = VLOAD(N0)(0, rhs);
3341
3342 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3343
3344 lhs += LHS_STEP_X;
3345 rhs += RHS_STEP_X;
3346
3347 a0 = VLOAD(M0)(0, lhs);
3348 b0 = VLOAD(N0)(0, rhs);
3349
3350 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3351
3352 lhs += LHS_STEP_X;
3353 rhs += RHS_STEP_X;
3354
3355 a0 = VLOAD(M0)(0, lhs);
3356 b0 = VLOAD(N0)(0, rhs);
3357
3358 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3359
3360 lhs += LHS_STEP_X;
3361 rhs += RHS_STEP_X;
3362#endif // K0 > 4
3363
3364#if K0 > 8
3365 a0 = VLOAD(M0)(0, lhs);
3366 b0 = VLOAD(N0)(0, rhs);
3367
3368 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3369
3370 lhs += LHS_STEP_X;
3371 rhs += RHS_STEP_X;
3372
3373 a0 = VLOAD(M0)(0, lhs);
3374 b0 = VLOAD(N0)(0, rhs);
3375
3376 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3377
3378 lhs += LHS_STEP_X;
3379 rhs += RHS_STEP_X;
3380
3381 a0 = VLOAD(M0)(0, lhs);
3382 b0 = VLOAD(N0)(0, rhs);
3383
3384 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3385
3386 lhs += LHS_STEP_X;
3387 rhs += RHS_STEP_X;
3388
3389 a0 = VLOAD(M0)(0, lhs);
3390 b0 = VLOAD(N0)(0, rhs);
3391
3392 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3393
3394 lhs += LHS_STEP_X;
3395 rhs += RHS_STEP_X;
3396
3397 a0 = VLOAD(M0)(0, lhs);
3398 b0 = VLOAD(N0)(0, rhs);
3399
3400 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3401
3402 lhs += LHS_STEP_X;
3403 rhs += RHS_STEP_X;
3404
3405 a0 = VLOAD(M0)(0, lhs);
3406 b0 = VLOAD(N0)(0, rhs);
3407
3408 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3409
3410 lhs += LHS_STEP_X;
3411 rhs += RHS_STEP_X;
3412
3413 a0 = VLOAD(M0)(0, lhs);
3414 b0 = VLOAD(N0)(0, rhs);
3415
3416 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3417
3418 lhs += LHS_STEP_X;
3419 rhs += RHS_STEP_X;
3420
3421 a0 = VLOAD(M0)(0, lhs);
3422 b0 = VLOAD(N0)(0, rhs);
3423
3424 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3425
3426 lhs += LHS_STEP_X;
3427 rhs += RHS_STEP_X;
3428#endif // K0 > 8
3429
3430#ifndef LHS_INTERLEAVE
3431 lhs += (M0 * K0 * (V0 - 1));
3432#endif // LHS_INTERLEAVE
3433
3434#ifndef RHS_INTERLEAVE
3435 rhs += (N0 * K0 * (H0 - 1));
3436#endif // RHS_INTERLEAVE
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003437 }
3438
3439 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3440
3441 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3442
3443#if defined(REINTERPRET_OUTPUT_AS_3D)
3444
3445 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
3446 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
3447 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3448 // multiply dst_stride_z by DEPTH_GEMM3D
3449 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3450
3451#else // defined(REINTERPRET_OUTPUT_AS_3D)
3452
3453 // Add offset for batched GEMM
3454 dst_addr += z * dst_stride_z;
3455
3456#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3457
3458 // Multiply by the weight of matrix-matrix product and store the result
3459#if defined(ALPHA)
3460 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3461#endif // defined(ALPHA)
3462
3463 // Add beta*bias
3464#if defined(BETA)
3465#if defined(BROADCAST_BIAS)
3466 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3467
3468 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3469
3470#ifndef UNIT_BETA
3471 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3472#endif // UNIT_BIAS
3473
3474 // c = c + bias[broadcasted]
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003475#if defined(MIXED_PRECISION)
3476 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3477 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3478#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003479 ADD_BLOCK_BROADCAST(M0, c, bias0);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003480#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003481
3482#else // defined(BROADCAST_BIAS)
3483 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
3484
3485 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3486
3487#ifndef UNIT_BETA
3488 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3489#endif // UNIT_BIAS
3490
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003491#if defined(MIXED_PRECISION)
3492 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3493 ADD_BLOCK(M0, c, bias_hp);
3494#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003495 ADD_BLOCK(M0, c, bias);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003496#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003497
3498#endif // defined(BROADCAST_BIAS)
3499#endif // defined(BETA)
3500
3501#if defined(ACTIVATION_TYPE)
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003502#if defined(MIXED_PRECISION)
3503 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
3504#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003505 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003506#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003507#endif // defined(ACTIVATION_TYPE)
3508
3509 // Store output block
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003510#if defined(MIXED_PRECISION)
3511 CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
3512#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003513 STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003514#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003515
3516#undef LHS_BLOCK_SIZE
3517#undef LHS_OFFSET_X
3518#undef LHS_STEP_X
3519#undef RHS_BLOCK_SIZE
3520#undef RHS_OFFSET_X
3521#undef RHS_STEP_X
3522}
3523
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003524#if defined(OPENCL_IMAGE_SUPPORT)
3525/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
3526 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3527 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3528 *
3529 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
3530 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003531 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
3532 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Gian Marco Iodice781cba72020-06-19 16:56:57 +01003533 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
3534 * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
3535 * could be different from the value returned by get_image_height(rhs_img).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003536 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3537 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3538 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3539 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3540 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
3541 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3542 * - M0 = 2, 3, 4, 8
3543 * - N0 = 4, 8, 16
3544 * - K0 = 4, 8, 16
3545 * - V0 >= 1
3546 * - H0 >= 1
3547 *
3548 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3549 * The activation function is performed after the bias addition
3550 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3551 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3552 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3553 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3554 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3555 *
3556 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
3557 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3558 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3559 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3560 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3561 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3562 * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
3563 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3564 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3565 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3566 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3567 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3568 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3569 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3570 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3571 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3572 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3573 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3574 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003575 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003576 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3577 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3578 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3579 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3580 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3581 */
3582__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
3583 __read_only image2d_t rhs_img,
3584#if defined(BETA)
3585 IMAGE_DECLARATION(bias),
3586#endif // defined(BETA)
3587 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003588 uint k,
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003589 uint lhs_stride_z,
3590 uint rhs_stride_z,
3591#if defined(BETA)
3592 uint bias_stride_z,
3593#endif //defined(BETA)
3594 uint dst_stride_z
3595#if defined(REINTERPRET_OUTPUT_AS_3D)
3596 ,
3597 uint dst_cross_plane_pad
3598#endif // REINTERPRET_OUTPUT_AS_3D
3599 )
3600{
3601 // Pixel unit
3602#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
3603
3604 // Block size
3605#define LHS_BLOCK_SIZE ((K0) * (M0))
3606
3607#if defined(LHS_INTERLEAVE)
3608#define LHS_OFFSET_X (M0)
3609#define LHS_STEP_X ((M0) * (V0))
3610#define LHS_STEP_LOOP (1)
3611#else // defined(INTERLEAVE)
3612#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3613#define LHS_STEP_X (M0)
3614#define LHS_STEP_LOOP (V0)
3615#endif // defined(INTERLEAVE)
3616
3617 // Block size
3618#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
3619
3620 // RHS offset and step X
3621#if defined(RHS_INTERLEAVE)
3622#define RHS_OFFSET_X (PIXEL_UNIT)
3623#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
3624#else // defined(RHS_INTERLEAVE)
3625#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3626#define RHS_STEP_X (PIXEL_UNIT)
3627#endif // defined(RHS_INTERLEAVE)
3628
3629 const uint x = get_global_id(0);
3630 const uint y = get_global_id(1);
3631 const uint z = get_global_id(2);
3632
3633#if defined(DUMMY_WORK_ITEMS)
3634 if((x * N0 >= N) || (y * M0 >= M))
3635 {
3636 return;
3637 }
3638#endif // defined(DUMMY_WORK_ITEMS)
3639
3640 // Compute LHS matrix address
3641 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3642
3643#if defined(MATRIX_B_DEPTH)
3644 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3645 const uint z_rhs = (z % MATRIX_B_DEPTH);
3646#else // defined(MATRIX_B_DEPTH)
3647 const uint z_rhs = z;
3648#endif // defined(MATRIX_B_DEPTH)
3649
3650 // Compute RHS matrix coordinates
3651 uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
3652 const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
3653
3654 // Initialize the accumulators
3655 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
3656
3657 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3658
3659 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3660
3661 for(int i = 0; i < K; i += K0)
3662 {
3663 VEC_DATA_TYPE(DATA_TYPE, M0)
3664 a0;
3665 VEC_DATA_TYPE(DATA_TYPE, N0)
3666 b0;
3667
3668 a0 = VLOAD(M0)(0, lhs);
3669 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
3670
3671 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3672
3673 lhs += LHS_STEP_X;
3674
3675#if K0 > 1
3676 a0 = VLOAD(M0)(0, lhs);
3677 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
3678
3679 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3680
3681 lhs += LHS_STEP_X;
3682#endif // K0 > 1
3683
3684#if K0 > 2
3685 a0 = VLOAD(M0)(0, lhs);
3686 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
3687
3688 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3689
3690 lhs += LHS_STEP_X;
3691#endif // K0 > 2
3692
3693#if K0 > 3
3694 a0 = VLOAD(M0)(0, lhs);
3695 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
3696
3697 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3698
3699 lhs += LHS_STEP_X;
3700#endif // K0 > 3
3701
3702#if K0 > 4
3703 a0 = VLOAD(M0)(0, lhs);
3704 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
3705
3706 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3707
3708 lhs += LHS_STEP_X;
3709
3710 a0 = VLOAD(M0)(0, lhs);
3711 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
3712
3713 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3714
3715 lhs += LHS_STEP_X;
3716
3717 a0 = VLOAD(M0)(0, lhs);
3718 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
3719
3720 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3721
3722 lhs += LHS_STEP_X;
3723
3724 a0 = VLOAD(M0)(0, lhs);
3725 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
3726
3727 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3728
3729 lhs += LHS_STEP_X;
3730#endif // K0 > 4
3731
3732#if K0 > 8
3733 a0 = VLOAD(M0)(0, lhs);
3734 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
3735
3736 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3737
3738 lhs += LHS_STEP_X;
3739
3740 a0 = VLOAD(M0)(0, lhs);
3741 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
3742
3743 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3744
3745 lhs += LHS_STEP_X;
3746
3747 a0 = VLOAD(M0)(0, lhs);
3748 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
3749
3750 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3751
3752 lhs += LHS_STEP_X;
3753
3754 a0 = VLOAD(M0)(0, lhs);
3755 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
3756
3757 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3758
3759 lhs += LHS_STEP_X;
3760
3761 a0 = VLOAD(M0)(0, lhs);
3762 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
3763
3764 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3765
3766 lhs += LHS_STEP_X;
3767
3768 a0 = VLOAD(M0)(0, lhs);
3769 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
3770
3771 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3772
3773 lhs += LHS_STEP_X;
3774
3775 a0 = VLOAD(M0)(0, lhs);
3776 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
3777
3778 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3779
3780 lhs += LHS_STEP_X;
3781
3782 a0 = VLOAD(M0)(0, lhs);
3783 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
3784
3785 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3786
3787 lhs += LHS_STEP_X;
3788#endif // K0 > 8
3789
3790#ifndef LHS_INTERLEAVE
3791 lhs += (M0 * K0 * (V0 - 1));
3792#endif // LHS_INTERLEAVE
3793
3794 x_rhs += K0 * RHS_STEP_X;
3795#ifndef RHS_INTERLEAVE
3796 x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
3797#endif // RHS_INTERLEAVE
3798 }
3799
3800 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3801
3802 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3803
3804#if defined(REINTERPRET_OUTPUT_AS_3D)
3805
3806 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
3807 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
3808 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3809 // multiply dst_stride_z by DEPTH_GEMM3D
3810 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3811
3812#else // defined(REINTERPRET_OUTPUT_AS_3D)
3813
3814 // Add offset for batched GEMM
3815 dst_addr += z * dst_stride_z;
3816
3817#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3818
3819 // Multiply by the weight of matrix-matrix product and store the result
3820#if defined(ALPHA)
3821 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3822#endif // defined(ALPHA)
3823
3824 // Add beta*bias
3825#if defined(BETA)
3826#if defined(BROADCAST_BIAS)
3827 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3828
3829 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3830
3831#ifndef UNIT_BETA
3832 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3833#endif // UNIT_BIAS
3834
3835 // c = c + bias[broadcasted]
3836#if defined(MIXED_PRECISION)
3837 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3838 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3839#else // defined(MIXED_PRECISION)
3840 ADD_BLOCK_BROADCAST(M0, c, bias0);
3841#endif // defined(MIXED_PRECISION)
3842
3843#else // defined(BROADCAST_BIAS)
3844 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
3845
3846 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3847
3848#ifndef UNIT_BETA
3849 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3850#endif // UNIT_BIAS
3851
3852#if defined(MIXED_PRECISION)
3853 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3854 ADD_BLOCK(M0, c, bias_hp);
3855#else // defined(MIXED_PRECISION)
3856 ADD_BLOCK(M0, c, bias);
3857#endif // defined(MIXED_PRECISION)
3858
3859#endif // defined(BROADCAST_BIAS)
3860#endif // defined(BETA)
3861
3862#if defined(ACTIVATION_TYPE)
3863#if defined(MIXED_PRECISION)
3864 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
3865#else // defined(MIXED_PRECISION)
3866 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
3867#endif // defined(MIXED_PRECISION)
3868#endif // defined(ACTIVATION_TYPE)
3869
3870 // Store output block
3871#if defined(MIXED_PRECISION)
3872 CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
3873#else // defined(MIXED_PRECISION)
3874 STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
3875#endif // defined(MIXED_PRECISION)
3876
3877#undef LHS_BLOCK_SIZE
3878#undef LHS_OFFSET_X
3879#undef LHS_STEP_X
3880#undef RHS_BLOCK_SIZE
3881#undef RHS_OFFSET_X
3882#undef RHS_STEP_X
3883#undef PIXEL_UNIT
3884#undef LHS_STEP_LOOP
3885#undef RHS_STEP_LOOP
3886}
3887#endif // defined(OPENCL_IMAGE_SUPPORT)
3888
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003889#endif // defined(LHS_TRANSPOSE)
3890
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00003891#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
3892
giuros01b3204e72019-04-01 13:50:22 +01003893#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
3894
/** Accumulate a fused multiply-add in place: c = fma(a, b, c).
 *
 * @note The expansion is a statement expression ("({ ... })"); every use visible in this part of the file invokes it as a statement.
 * @note @p c appears twice in the expansion (as the assignment target and as the addend), so it must be an assignable expression.
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator that receives the result
 */
#define VFMA(a, b, c) \
    ({ \
        c = fma(a, b, c); \
    })
3899
/** Accumulate one K step of the M0xN0 output tile.
 *
 * For every row r in [0, M0) the macro takes component @p i of the vector a##r, broadcasts it to an
 * N0-wide vector of DATA_TYPE and accumulates its product with @p b into c##r through VFMA.
 *
 * @note @p i is pasted after ".s", so it must be a single vector component suffix; the call sites in this file pass 0-9 and A-F.
 * @note @p a and @p c are variable name PREFIXES: the row index is token-pasted onto them (a0, a1, ... / c0, c1, ...).
 * @note One variant is selected at preprocessing time for M0 = 1..8; any other M0 stops the build with "M0 not supported".
 *
 * @param[in]      i Component suffix selecting the element of each a##r to use
 * @param[in]      a Name prefix of the M0 LHS row variables
 * @param[in]      b RHS vector multiplied with every broadcast LHS element
 * @param[in, out] c Name prefix of the M0 accumulator variables
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported
3971
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS matrix is NOT reshaped
 *
 * Each work-item accumulates one M0xN0 tile of the destination: K is consumed in steps of K0, the remaining
 * (K % K0) accumulations are processed one at a time, and the tile is finally (optionally) scaled, biased,
 * activated and stored with STORE_BLOCK_BOUNDARY_AWARE.
 *
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
 * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *
 * @note If -DALPHA is passed at compile time, the accumulators are scaled by ALPHA (through SCALE_BLOCK) before the bias addition.
 * @note The bias arguments exist only if -DBETA is passed at compile time. The bias is scaled by BETA unless -DUNIT_BETA is passed.
 *       With -DBROADCAST_BIAS a single bias row is loaded and added to each of the M0 rows; otherwise an M0xN0 bias block is loaded and added.
 * @note If -DMATRIX_B_DEPTH is passed at compile time, the RHS batch index is computed as (z % MATRIX_B_DEPTH) instead of z.
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
                             IMAGE_DECLARATION(rhs),
#if defined(BETA)
                             IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                             IMAGE_DECLARATION(dst),
                             uint lhs_stride_z,
                             uint rhs_stride_z,
#if defined(BETA)
                             uint bias_stride_z,
#endif //defined(BETA)
                             uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                             ,
                             uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                             ,
                             uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                            )
{
    // Tile coordinates (x: block of N0 columns, y: block of M0 rows) and batch index (z)
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose tile starts outside the MxN output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Compute RHS matrix address
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Per-row extra offsets: zlhs0..zlhs(M0-1) for the LHS rows, zero0..zero15 as the all-zero set for RHS/bias loads
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;

    // Main loop: consume K in full steps of K0
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Supported cases (M0, K0):
        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

        // One accumulation per K0 element: component i of every LHS row times RHS row b<i>
        RHS_VFMA_M0xN0(0, a, b0, c);
        RHS_VFMA_M0xN0(1, a, b1, c);
#if K0 > 2
        RHS_VFMA_M0xN0(2, a, b2, c);
#endif // K0 > 2
#if K0 > 3
        RHS_VFMA_M0xN0(3, a, b3, c);
#endif // K0 > 3
#if K0 > 4
        RHS_VFMA_M0xN0(4, a, b4, c);
        RHS_VFMA_M0xN0(5, a, b5, c);
        RHS_VFMA_M0xN0(6, a, b6, c);
        RHS_VFMA_M0xN0(7, a, b7, c);
#endif // K0 > 4
#if K0 > 8
        RHS_VFMA_M0xN0(8, a, b8, c);
        RHS_VFMA_M0xN0(9, a, b9, c);
        RHS_VFMA_M0xN0(A, a, bA, c);
        RHS_VFMA_M0xN0(B, a, bB, c);
        RHS_VFMA_M0xN0(C, a, bC, c);
        RHS_VFMA_M0xN0(D, a, bD, c);
        RHS_VFMA_M0xN0(E, a, bE, c);
        RHS_VFMA_M0xN0(F, a, bF, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * rhs_stride_y;
    }

    // Left-over accumulations
    for(; i < K; ++i)
    {
        // Load values from LHS matrix
        // Each a<r> is a 2-element vector initialised from a single scalar so that RHS_VFMA_M0xN0 can
        // read it through the ".s0" component accessor
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
#endif // M0 > 7

        // Load one row of N0 values from RHS matrix
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
        RHS_VFMA_M0xN0(0, a, b, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += rhs_stride_y;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // x caches get_global_id(0); reuse it like every other address computation in this kernel
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
}
4256#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
4257
Gian Marco36a0a462018-01-12 10:21:40 +00004258#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004259/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00004260 *
Gian Marco19835e52018-01-30 13:35:54 +00004261 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004262 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
4263 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4264 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
4265 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004266 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004267 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
4268 * The activation function is performed after the bias addition
4269 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004270 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4271 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4272 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4273 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
4274 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004275 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
4276 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
4277 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4278 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
4279 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4280 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004281 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004282 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
4283 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4284 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
4285 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4286 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
4288 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4289 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
4290 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4291 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
4292 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004293 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004294 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004295 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004296 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004297 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004298 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004299 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
4300 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004301 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004302 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01004303 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004304 */
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01004305__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
4306 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004307#if defined(BETA)
4308 IMAGE_DECLARATION(src2),
4309#endif // defined(BETA)
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01004310 IMAGE_DECLARATION(dst),
4311 uint src0_stride_z,
4312 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004313#if defined(BETA)
4314 uint src2_stride_z,
4315#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004316 uint dst_stride_z
4317#if defined(REINTERPRET_OUTPUT_AS_3D)
4318 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004319 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004320#endif // REINTERPRET_OUTPUT_AS_3D
4321 )
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004322{
Gian Marco36a0a462018-01-12 10:21:40 +00004323 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
4324 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marcoae2af742018-02-15 12:35:44 +00004325 int z = get_global_id(2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004326
Gian Marco36a0a462018-01-12 10:21:40 +00004327 // Offset
4328 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
4329 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004330
Gian Marco36a0a462018-01-12 10:21:40 +00004331 // src_addr_a = address of matrix A
4332 // src_addr_b = address of matrix B
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00004333 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
4334 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
4335
4336#if defined(MATRIX_B_DEPTH)
4337 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4338 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
4339#else // defined(MATRIX_B_DEPTH)
4340 src1_addr_in_bytes += z * src1_stride_z;
4341#endif // defined(MATRIX_B_DEPTH)
4342
4343 __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
4344 __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004345
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004346 // Compute end row address for matrix B
Gian Marco36a0a462018-01-12 10:21:40 +00004347 __global float *src_end_addr_b = src_addr_b + COLS_B;
4348
4349 src_addr_a += offset_row_a;
4350 src_addr_b += offset_row_b;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004351
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004352 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004353 float4 c0 = 0.0f;
4354 float4 c1 = 0.0f;
4355 float4 c2 = 0.0f;
4356 float4 c3 = 0.0f;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004357
Gian Marco36a0a462018-01-12 10:21:40 +00004358 for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004359 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004360 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004361 float4 a0 = vload4(0, src_addr_a);
4362 float4 b0 = vload4(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004363
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004364 c0 += (float4)a0.s0 * b0;
4365 c1 += (float4)a0.s1 * b0;
4366 c2 += (float4)a0.s2 * b0;
4367 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004368
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004369 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004370 a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
4371 b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004372
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004373 c0 += (float4)a0.s0 * b0;
4374 c1 += (float4)a0.s1 * b0;
4375 c2 += (float4)a0.s2 * b0;
4376 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004377 }
4378
Gian Marco36a0a462018-01-12 10:21:40 +00004379 for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004380 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004381 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004382 float4 a0 = vload4(0, src_addr_a);
4383 float4 b0 = vload4(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004384
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004385 c0 += (float4)a0.s0 * b0;
4386 c1 += (float4)a0.s1 * b0;
4387 c2 += (float4)a0.s2 * b0;
4388 c3 += (float4)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004389 }
4390
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004391 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004392 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
4393
Gian Marcoae2af742018-02-15 12:35:44 +00004394 // Compute dst address
4395 __global uchar *dst_addr = offset(&dst, 0, 0);
4396
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004397 uint4 zout = 0;
4398
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004399#if defined(REINTERPRET_OUTPUT_AS_3D)
4400 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004401 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004402 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004403 // | |
4404 // | plane0 |
4405 // | |
4406 // |__________________|
4407 // |******************|
4408 // | cross_plane_pad |
4409 // |******************|
4410 // | |
4411 // | plane1 |
4412 // | |
4413 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004414
4415 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004416 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
4417 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004418
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004419 // Add offset due to the cross plane paddings
4420 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004421
4422 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4423 // multiply dst_stride_z by DEPTH_GEMM3D
4424 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004425#else // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marcoae2af742018-02-15 12:35:44 +00004426 // Add offset for batched GEMM
4427 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004428#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4429
4430 // Multiply by the weight of matrix-matrix product and store the result
4431#if defined(ALPHA)
4432 SCALE_BLOCK(4, float, c, ALPHA);
4433#endif // defined(ALPHA)
4434
4435 // Add beta*bias
4436#if defined(BETA)
4437 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
4438
4439#if defined(BROADCAST_BIAS)
4440 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
4441
4442 LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4443
4444#ifndef UNIT_BETA
4445 SCALE_BLOCK(1, float, bias, BETA);
4446#endif // UNIT_BIAS
4447
4448 // c = c + bias[broadcasted]
4449 ADD_BLOCK_BROADCAST(4, c, bias0);
4450
4451#else // defined(BROADCAST_BIAS)
4452 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
4453 2) * src2_stride_z;
4454
4455 LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4456
4457#ifndef UNIT_BETA
4458 SCALE_BLOCK(4, float, bias, BETA);
4459#endif // UNIT_BIAS
4460
4461 // c = c + bias
4462 ADD_BLOCK(4, c, bias);
4463
4464#endif // defined(BROADCAST_BIAS)
4465#endif // defined(BETA)
4466
4467#if defined(ACTIVATION_TYPE)
4468 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
4469#endif // defined(ACTIVATION_TYPE)
Gian Marcoae2af742018-02-15 12:35:44 +00004470
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004471 // Store 4x4 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004472 vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
4473 vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
4474 vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
4475 vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004476}
4477
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004478/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004479 *
Gian Marco19835e52018-01-30 13:35:54 +00004480 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004481 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
4482 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4483 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4484 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
4485 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004486 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004487 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
4488 * The activation function is performed after the bias addition
4489 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004490 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4491 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4492 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4493 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
4494 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004495 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
4496 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
4497 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4498 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
4499 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4500 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004501 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004502 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
4503 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4504 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
4505 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4506 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004507 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
4508 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4509 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
4510 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4511 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
4512 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004513 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004514 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004515 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004516 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004517 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004518 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004519 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
4520 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004521 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004522 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01004523 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004524 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01004525__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
4526 IMAGE_DECLARATION(src1),
 // The bias tensor (src2) is only part of the signature when BETA is defined
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004527#if defined(BETA)
4528 IMAGE_DECLARATION(src2),
4529#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00004530 IMAGE_DECLARATION(dst),
4531 uint src0_stride_z,
4532 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004533#if defined(BETA)
4534 uint src2_stride_z,
4535#endif // defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004536 uint dst_stride_z
 // cross_plane_pad is only part of the signature when the output is reinterpreted as 3D
4537#if defined(REINTERPRET_OUTPUT_AS_3D)
4538 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004539 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004540#endif // REINTERPRET_OUTPUT_AS_3D
4541 )
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004542{
 // Each work-item accumulates one 4x4 float tile (c0..c3, one float4 per output row) from the
 // reshaped A (src0) and reshaped B (src1) buffers, then applies the optional ALPHA scaling,
 // BETA * bias addition and activation stages before storing the four rows to dst.

 // x / y select the row of reshaped B / reshaped A read by this work-item; with MULT_* factors
 // greater than 1 several consecutive work-items map to the same row.
Gian Marco36a0a462018-01-12 10:21:40 +00004543 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
4544 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marcoae2af742018-02-15 12:35:44 +00004545 int z = get_global_id(2);
Gian Marco36a0a462018-01-12 10:21:40 +00004546
4547 // Offset (in float elements) of this work-item's 4-float slice inside each chunk of the shared row.
 // A chunk is 4 * MULT_* floats wide, which matches the pointer increments used in the loops below.
4548 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
4549 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
4550
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004551 // src_addr_a = address of matrix A
4552 // src_addr_b = address of matrix B
 // Both start addresses are first computed as byte offsets from the respective base pointers
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00004553 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
4554 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
4555
4556#if defined(MATRIX_B_DEPTH)
4557 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
 // (the z index wraps around MATRIX_B_DEPTH so B is reused across batches of A)
4558 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
4559#else // defined(MATRIX_B_DEPTH)
4560 src1_addr_in_bytes += z * src1_stride_z;
4561#endif // defined(MATRIX_B_DEPTH)
4562
4563 __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
4564 __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004565
 // From here on the pointers are float pointers, so the offsets below are in elements, not bytes
Gian Marco36a0a462018-01-12 10:21:40 +00004566 src_addr_a += offset_row_a;
4567 src_addr_b += offset_row_b;
4568
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004569 // Reset accumulators (cN holds row N of the 4x4 output tile)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004570 float4 c0 = 0.0f;
4571 float4 c1 = 0.0f;
4572 float4 c2 = 0.0f;
4573 float4 c3 = 0.0f;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004574
 // Number of accumulation steps: every step advances src_addr_b by 4 * MULT_TRANSPOSE1XW_WIDTH floats.
 // The macro is local to this kernel and is undefined right after it.
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004575#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))
4576
 // Main loop, manually unrolled four times. Every unrolled step loads one float4 from A and one from B,
 // advances both pointers to the next chunk and performs a 4x4 outer-product update
 // (cR.sK += a0.sR * b0.sK) using scalar fma calls.
4577 int i = 0;
4578 for(; i <= (int)(COLS_MTX_B - 4); i += 4)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004579 {
4580 // Load values from matrix A (interleaved) and matrix B (transposed)
4581 float4 a0 = vload4(0, src_addr_a);
4582 float4 b0 = vload4(0, src_addr_b);
4583
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004584 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4585 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004586
 // Unrolled step 1 of 4: row 0 of the tile uses a0.s0, row 1 uses a0.s1, and so on
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004587 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4588 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4589 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4590 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004591
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004592 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4593 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4594 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4595 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004596
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004597 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4598 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4599 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4600 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004601
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004602 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4603 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4604 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4605 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004606
4607 // Load values from matrix A (interleaved) and matrix B (transposed)
 // Unrolled step 2 of 4
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004608 a0 = vload4(0, src_addr_a);
4609 b0 = vload4(0, src_addr_b);
4610
4611 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4612 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004613
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004614 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4615 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4616 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4617 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004618
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004619 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4620 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4621 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4622 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004623
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004624 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4625 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4626 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4627 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004628
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004629 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4630 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4631 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4632 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004633
4634 // Load values from matrix A (interleaved) and matrix B (transposed)
 // Unrolled step 3 of 4
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004635 a0 = vload4(0, src_addr_a);
4636 b0 = vload4(0, src_addr_b);
4637
4638 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4639 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
4640
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004641 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4642 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4643 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4644 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004645
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004646 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4647 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4648 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4649 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004650
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004651 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4652 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4653 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4654 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004655
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004656 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4657 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4658 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4659 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004660
4661 // Load values from matrix A (interleaved) and matrix B (transposed)
 // Unrolled step 4 of 4
4662 a0 = vload4(0, src_addr_a);
4663 b0 = vload4(0, src_addr_b);
4664
4665 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4666 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004667
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004668 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4669 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4670 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4671 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004672
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004673 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4674 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4675 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4676 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004677
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004678 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4679 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4680 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4681 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004682
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004683 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4684 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4685 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4686 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004687 }
4688
 // Tail loop: handles the steps left over by the unrolled loop (fewer than four), one step per iteration
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004689 for(; i < (int)(COLS_MTX_B); ++i)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004690 {
4691 // Load values from matrix A (interleaved) and matrix B (transposed)
4692 float4 a0 = vload4(0, src_addr_a);
4693 float4 b0 = vload4(0, src_addr_b);
4694
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01004695 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
4696 src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
4697
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004698 c0.s0 = fma(a0.s0, b0.s0, c0.s0);
4699 c0.s1 = fma(a0.s0, b0.s1, c0.s1);
4700 c0.s2 = fma(a0.s0, b0.s2, c0.s2);
4701 c0.s3 = fma(a0.s0, b0.s3, c0.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004702
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004703 c1.s0 = fma(a0.s1, b0.s0, c1.s0);
4704 c1.s1 = fma(a0.s1, b0.s1, c1.s1);
4705 c1.s2 = fma(a0.s1, b0.s2, c1.s2);
4706 c1.s3 = fma(a0.s1, b0.s3, c1.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004707
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004708 c2.s0 = fma(a0.s2, b0.s0, c2.s0);
4709 c2.s1 = fma(a0.s2, b0.s1, c2.s1);
4710 c2.s2 = fma(a0.s2, b0.s2, c2.s2);
4711 c2.s3 = fma(a0.s2, b0.s3, c2.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004712
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004713 c3.s0 = fma(a0.s3, b0.s0, c3.s0);
4714 c3.s1 = fma(a0.s3, b0.s1, c3.s1);
4715 c3.s2 = fma(a0.s3, b0.s2, c3.s2);
4716 c3.s3 = fma(a0.s3, b0.s3, c3.s3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004717 }
4718
4719 // Compute destination address
 // NOTE(review): CONVERT_TO_IMAGE_STRUCT and offset() are defined outside this chunk; presumably they
 // resolve the first output element owned by this work-item - confirm against the helper header.
4720 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
4721
Gian Marcoae2af742018-02-15 12:35:44 +00004722 // Compute dst address
4723 __global uchar *dst_addr = offset(&dst, 0, 0);
4724
 // Extra offset added to each of the four row store addresses (one lane per row).
 // It stays 0 unless REINTERPRET_OUTPUT_AS_3D is defined.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004725 uint4 zout = 0;
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00004726
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004727#if defined(REINTERPRET_OUTPUT_AS_3D)
4728 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004729 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004730 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004731 // | |
4732 // | plane0 |
4733 // | |
4734 // |__________________|
4735 // |******************|
4736 // | cross_plane_pad |
4737 // |******************|
4738 // | |
4739 // | plane1 |
4740 // | |
4741 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004742
4743 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
 // (one plane index per tile row, hence the (0, 1, 2, 3) lane offsets), then clamped to the last plane
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004744 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
4745 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004746
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004747 // Add offset due to the cross plane paddings
 // (plane index * cross_plane_pad rows * dst_stride_y per row)
4748 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004749
4750 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4751 // multiply dst_stride_z by DEPTH_GEMM3D
4752 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004753#else // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marcoae2af742018-02-15 12:35:44 +00004754 // Add offset for batched GEMM
4755 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004756#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4757
4758 // Multiply by the weight of matrix-matrix product and store the result
 // NOTE(review): SCALE_BLOCK is defined outside this chunk; presumably it scales c0..c3 by ALPHA - confirm
4759#if defined(ALPHA)
4760 SCALE_BLOCK(4, float, c, ALPHA);
4761#endif // defined(ALPHA)
4762
4763 // Add beta*bias
4764#if defined(BETA)
 // NOTE(review): presumably declares zero0..zero3 = 0, passed as the last argument of LOAD_BLOCK - confirm
4765 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
4766
4767#if defined(BROADCAST_BIAS)
 // Broadcast case: the bias address depends only on the x position (4 floats per work-item)
4768 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
4769
4770 LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4771
 // The BETA scaling is compiled out when UNIT_BETA is defined
4772#ifndef UNIT_BETA
4773 SCALE_BLOCK(1, float, bias, BETA);
4774#endif // UNIT_BETA
4775
4776 // c = c + bias[broadcasted]
4777 ADD_BLOCK_BROADCAST(4, c, bias0);
4778
4779#else // defined(BROADCAST_BIAS)
 // Non-broadcast case: the bias address also depends on the tile row (get_global_id(1) * 4) and on the batch
4780 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
4781 2) * src2_stride_z;
4782
4783 LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
4784
 // The BETA scaling is compiled out when UNIT_BETA is defined
4785#ifndef UNIT_BETA
4786 SCALE_BLOCK(4, float, bias, BETA);
4787#endif // UNIT_BETA
4788
4789 // c = c + bias
4790 ADD_BLOCK(4, c, bias);
4791
4792#endif // defined(BROADCAST_BIAS)
4793#endif // defined(BETA)
4794
 // The activation is applied after the ALPHA scaling and the bias addition
4795#if defined(ACTIVATION_TYPE)
4796 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
4797#endif // defined(ACTIVATION_TYPE)
Gian Marcoae2af742018-02-15 12:35:44 +00004798
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004799 // Store 4x4 block: row N goes to dst_addr + N * dst_stride_y plus its zout lane
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004800 vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
4801 vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
4802 vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
4803 vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004804}
4805
Georgios Pinitas84225582018-05-14 12:00:05 +01004806// Undefine the helper macro defined inside gemm_mm_interleaved_transposed_f32_bifrost so it does not leak into the code that follows
4807#undef COLS_MTX_B
4808
Matthew Bentham6f31f8c2017-10-27 11:50:06 +01004809#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004810/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00004811 *
Gian Marco19835e52018-01-30 13:35:54 +00004812 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004813 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
4814 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
4815 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
4816 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004817 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004818 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
4819 * The activation function is performed after the bias addition
4820 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004821 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4822 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4823 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4824 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
4825 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004826 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
4827 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
4828 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4829 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
4830 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4831 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004832 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004833 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
4834 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4835 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
4836 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
4837 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004838 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
4839 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4840 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
4841 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4842 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
4843 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004844 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004845 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004846 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004847 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco36a0a462018-01-12 10:21:40 +00004848 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004849 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004850 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
4851 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004852 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004853 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01004854 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004855 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01004856__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
4857 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004858#if defined(BETA)
4859 IMAGE_DECLARATION(src2),
4860#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00004861 IMAGE_DECLARATION(dst),
4862 uint src0_stride_z,
4863 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004864#if defined(BETA)
4865 uint src2_stride_z,
4866#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004867 uint dst_stride_z
4868#if defined(REINTERPRET_OUTPUT_AS_3D)
4869 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004870 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004871#endif // REINTERPRET_OUTPUT_AS_3D
4872 )
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004873{
Gian Marco36a0a462018-01-12 10:21:40 +00004874 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
4875 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marcoae2af742018-02-15 12:35:44 +00004876 int z = get_global_id(2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004877
Gian Marco36a0a462018-01-12 10:21:40 +00004878 // Offset
4879 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
4880 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004881
Gian Marco36a0a462018-01-12 10:21:40 +00004882 // src_addr_a = address of matrix A
4883 // src_addr_b = address of matrix B
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00004884 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
4885 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
4886
4887#if defined(MATRIX_B_DEPTH)
4888 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4889 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
4890#else // defined(MATRIX_B_DEPTH)
4891 src1_addr_in_bytes += z * src1_stride_z;
4892#endif // defined(MATRIX_B_DEPTH)
4893
4894 __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
4895 __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004896
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004897 // Compute end row address for matrix B
Gian Marco36a0a462018-01-12 10:21:40 +00004898 __global half *src_end_addr_b = src_addr_b + COLS_B;
4899
4900 src_addr_a += offset_row_a;
4901 src_addr_b += offset_row_b;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004902
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004903 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004904 half8 c0 = 0.0f;
4905 half8 c1 = 0.0f;
4906 half8 c2 = 0.0f;
4907 half8 c3 = 0.0f;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004908
Gian Marco36a0a462018-01-12 10:21:40 +00004909 for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004910 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004911 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004912 half4 a0 = vload4(0, src_addr_a);
4913 half8 b0 = vload8(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004914
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004915 c0 += (half8)a0.s0 * b0;
4916 c1 += (half8)a0.s1 * b0;
4917 c2 += (half8)a0.s2 * b0;
4918 c3 += (half8)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004919
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004920 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004921 a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
4922 b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004923
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004924 c0 += (half8)a0.s0 * b0;
4925 c1 += (half8)a0.s1 * b0;
4926 c2 += (half8)a0.s2 * b0;
4927 c3 += (half8)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004928 }
4929
Gian Marco36a0a462018-01-12 10:21:40 +00004930 for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004931 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004932 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco36a0a462018-01-12 10:21:40 +00004933 half4 a0 = vload4(0, src_addr_a);
4934 half8 b0 = vload8(0, src_addr_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004935
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004936 c0 += (half8)a0.s0 * b0;
4937 c1 += (half8)a0.s1 * b0;
4938 c2 += (half8)a0.s2 * b0;
4939 c3 += (half8)a0.s3 * b0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004940 }
4941
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004942 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004943 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
4944
Gian Marcoae2af742018-02-15 12:35:44 +00004945 // Compute dst address
4946 __global uchar *dst_addr = offset(&dst, 0, 0);
4947
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004948 uint4 zout = 0;
4949
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004950#if defined(REINTERPRET_OUTPUT_AS_3D)
4951 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004952 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004953 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004954 // | |
4955 // | plane0 |
4956 // | |
4957 // |__________________|
4958 // |******************|
4959 // | cross_plane_pad |
4960 // |******************|
4961 // | |
4962 // | plane1 |
4963 // | |
4964 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004965
4966 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004967 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
4968 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004969
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01004970 // Add offset due to the cross plane paddings
4971 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004972
4973 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4974 // multiply dst_stride_z by DEPTH_GEMM3D
4975 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004976#else // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marcoae2af742018-02-15 12:35:44 +00004977 // Add offset for batched GEMM
4978 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004979#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4980
4981 // Multiply by the weight of matrix-matrix product and store the result
4982#if defined(ALPHA)
4983 SCALE_BLOCK(4, half, c, ALPHA);
4984#endif // defined(ALPHA)
4985
4986 // Add beta*bias
4987#if defined(BETA)
4988 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
4989
4990#if defined(BROADCAST_BIAS)
4991 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
4992
4993 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
4994
4995#ifndef UNIT_BETA
4996 SCALE_BLOCK(1, half, bias, BETA);
4997#endif // UNIT_BIAS
4998
4999 // c = c + bias[broadcasted]
5000 ADD_BLOCK_BROADCAST(4, c, bias0);
5001
5002#else // defined(BROADCAST_BIAS)
5003
5004 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
5005 2) * src2_stride_z;
5006
5007 LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5008
5009#ifndef UNIT_BETA
5010 SCALE_BLOCK(4, half, bias, BETA);
5011#endif // UNIT_BIAS
5012
5013 // c = c + bias
5014 ADD_BLOCK(4, c, bias);
5015
5016#endif // defined(BROADCAST_BIAS)
5017#endif // defined(BETA)
5018
5019#if defined(ACTIVATION_TYPE)
5020 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
5021#endif // defined(ACTIVATION_TYPE)
Gian Marcoae2af742018-02-15 12:35:44 +00005022
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005023 // Store 4x8 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005024 vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
5025 vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
5026 vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
5027 vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01005028}
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005029
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005030/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32 floating point variable.
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00005031 *
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005032 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005033 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
5034 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
5035 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
5036 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005037 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005038 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
5039 * The activation function is performed after the bias addition
5040 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005041 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
5042 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
5043 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
5044 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
5045 *
5046 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
5047 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
5048 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5049 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
5050 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5051 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
5052 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
5053 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
5054 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5055 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
5056 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5057 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005058 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
5059 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
5060 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
5061 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
5062 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
5063 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005064 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
5065 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
5066 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
5067 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
5068 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
5069 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
5070 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
5071 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005072 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005073 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
5074 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
5075 */
5076__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
5077 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005078#if defined(BETA)
5079 IMAGE_DECLARATION(src2),
5080#endif // defined(BETA)
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005081 IMAGE_DECLARATION(dst),
5082 uint src0_stride_z,
5083 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005084#if defined(BETA)
5085 uint src2_stride_z,
5086#endif //defined(BETA)
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005087 uint dst_stride_z
5088#if defined(REINTERPRET_OUTPUT_AS_3D)
5089 ,
5090 uint cross_plane_pad
5091#endif // REINTERPRET_OUTPUT_AS_3D
5092 )
5093{
5094 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
5095 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
5096 int z = get_global_id(2);
5097
5098 // Offset
5099 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
5100 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
5101
5102 // src_addr_a = address of matrix A
5103 // src_addr_b = address of matrix B
5104 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
5105 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
5106
5107#if defined(MATRIX_B_DEPTH)
5108 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
5109 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
5110#else // defined(MATRIX_B_DEPTH)
5111 src1_addr_in_bytes += z * src1_stride_z;
5112#endif // defined(MATRIX_B_DEPTH)
5113
5114 __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
5115 __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
5116
5117 // Compute end row address for matrix B
5118 __global half *src_end_addr_b = src_addr_b + COLS_B;
5119
5120 src_addr_a += offset_row_a;
5121 src_addr_b += offset_row_b;
5122
5123 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005124 float8 c0 = 0.0f;
5125 float8 c1 = 0.0f;
5126 float8 c2 = 0.0f;
5127 float8 c3 = 0.0f;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005128
5129 for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
5130 {
5131 // Load values from matrix A (interleaved) and matrix B (transposed)
5132 float4 a0 = convert_float4(vload4(0, src_addr_a));
5133 float8 b0 = convert_float8(vload8(0, src_addr_b));
5134
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005135 c0 += (float8)a0.s0 * b0;
5136 c1 += (float8)a0.s1 * b0;
5137 c2 += (float8)a0.s2 * b0;
5138 c3 += (float8)a0.s3 * b0;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005139
5140 // Load values from matrix A (interleaved) and matrix B (transposed)
5141 a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
5142 b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
5143
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005144 c0 += (float8)a0.s0 * b0;
5145 c1 += (float8)a0.s1 * b0;
5146 c2 += (float8)a0.s2 * b0;
5147 c3 += (float8)a0.s3 * b0;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005148 }
5149
5150 for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
5151 {
5152 // Load values from matrix A (interleaved) and matrix B (transposed)
5153 float4 a0 = convert_float4(vload4(0, src_addr_a));
5154 float8 b0 = convert_float8(vload8(0, src_addr_b));
5155
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005156 c0 += (float8)a0.s0 * b0;
5157 c1 += (float8)a0.s1 * b0;
5158 c2 += (float8)a0.s2 * b0;
5159 c3 += (float8)a0.s3 * b0;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005160 }
5161
5162 // Compute destination address
5163 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
5164
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005165 // Compute dst address
5166 __global uchar *dst_addr = offset(&dst, 0, 0);
5167
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005168 uint4 zout = 0;
5169
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005170#if defined(REINTERPRET_OUTPUT_AS_3D)
5171 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
5172 // in order to take into account the presence of possible cross plane paddings
5173 //
5174 // | |
5175 // | plane0 |
5176 // | |
5177 // |__________________|
5178 // |******************|
5179 // | cross_plane_pad |
5180 // |******************|
5181 // | |
5182 // | plane1 |
5183 // | |
5184 // |__________________|
5185
5186 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005187 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
5188 zout = min(DEPTH_GEMM3D - 1, zout);
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005189
5190 // Add offset due to the cross plane paddings
5191 zout *= (cross_plane_pad * dst_stride_y);
5192
5193 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
5194 // multiply dst_stride_z by DEPTH_GEMM3D
5195 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005196#else // defined(REINTERPRET_OUTPUT_AS_3D)
5197 // Add offset for batched GEMM
5198 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005199#endif // defined(REINTERPRET_OUTPUT_AS_3D)
5200
5201 // Multiply by the weight of matrix-matrix product and store the result
5202#if defined(ALPHA)
5203 SCALE_BLOCK(4, float, c, ALPHA);
5204#endif // defined(ALPHA)
5205
5206#if defined(BETA)
5207 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
5208
5209#if defined(BROADCAST_BIAS)
5210 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
5211
5212 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5213
5214 float8 bias_f0 = convert_float8(bias0);
5215
5216#ifndef UNIT_BETA
5217 SCALE_BLOCK(1, float, bias_f, BETA);
5218#endif // UNIT_BIAS
5219
5220 // c = c + bias[broadcasted]
5221 ADD_BLOCK_BROADCAST(4, c, bias_f0);
5222
5223#else // defined(BROADCAST_BIAS)
5224 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
5225 2) * src2_stride_z;
5226
5227 LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5228
5229 float8 bias_f0 = convert_float8(bias0);
5230 float8 bias_f1 = convert_float8(bias1);
5231 float8 bias_f2 = convert_float8(bias2);
5232 float8 bias_f3 = convert_float8(bias3);
5233
5234#ifndef UNIT_BETA
5235 SCALE_BLOCK(4, float, bias_f, BETA);
5236#endif // UNIT_BIAS
5237
5238 // c = c + bias
5239 ADD_BLOCK(4, c, bias_f);
5240
5241#endif // defined(BROADCAST_BIAS)
5242#endif // defined(BETA)
5243
5244 half8 c_h0 = convert_half8(c0);
5245 half8 c_h1 = convert_half8(c1);
5246 half8 c_h2 = convert_half8(c2);
5247 half8 c_h3 = convert_half8(c3);
5248
5249#if defined(ACTIVATION_TYPE)
5250 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);
5251#endif // defined(ACTIVATION_TYPE)
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005252
5253 // Store 4x8 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005254 vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
5255 vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
5256 vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
5257 vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
Vidhya Sudhan Loganathan38d93bd2018-11-20 15:38:13 +00005258}
5259
/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
 *
 * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
 * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
5305__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
5306 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005307#if defined(BETA)
5308 IMAGE_DECLARATION(src2),
5309#endif // defined(BETA)
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005310 IMAGE_DECLARATION(dst),
5311 uint src0_stride_z,
5312 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005313#if defined(BETA)
5314 uint src2_stride_z,
5315#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005316 uint dst_stride_z
5317#if defined(REINTERPRET_OUTPUT_AS_3D)
5318 ,
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005319 uint cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005320#endif // REINTERPRET_OUTPUT_AS_3D
5321 )
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005322{
5323 int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
5324 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
5325 int z = get_global_id(2);
5326
5327 // Offset
5328 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
5329 const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
5330
5331 // src_addr_a = address of matrix A
5332 // src_addr_b = address of matrix B
5333 int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
5334 int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
5335
5336#if defined(MATRIX_B_DEPTH)
5337 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
5338 src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
5339#else // defined(MATRIX_B_DEPTH)
5340 src1_addr_in_bytes += z * src1_stride_z;
5341#endif // defined(MATRIX_B_DEPTH)
5342
5343 __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
5344 __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
5345
5346 // Compute end row address for matrix B
5347 __global half *src_end_addr_b = src_addr_b + COLS_B;
5348
5349 src_addr_a += offset_row_a;
5350 src_addr_b += offset_row_b;
5351
5352 // Reset accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005353 half8 c0 = 0.0f;
5354 half8 c1 = 0.0f;
5355 half8 c2 = 0.0f;
5356 half8 c3 = 0.0f;
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005357
5358#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))
5359
5360 int i = 0;
5361 for(; i <= (int)(COLS_MTX_B - 4); i += 4)
5362 {
5363#if MULT_INTERLEAVE4X4_HEIGHT == 1
5364 // Load values from matrix A (interleaved) and matrix B (transposed)
5365 half8 a0 = vload8(0, src_addr_a);
5366 half8 b0 = vload8(0, src_addr_b);
5367
5368 src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
5369 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5370
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005371 c0 = fma((half8)a0.s0, b0, c0);
5372 c1 = fma((half8)a0.s1, b0, c1);
5373 c2 = fma((half8)a0.s2, b0, c2);
5374 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005375
5376 // Load values from matrix B (transposed)
5377 b0 = vload8(0, src_addr_b);
5378
5379 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5380
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005381 c0 = fma((half8)a0.s4, b0, c0);
5382 c1 = fma((half8)a0.s5, b0, c1);
5383 c2 = fma((half8)a0.s6, b0, c2);
5384 c3 = fma((half8)a0.s7, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005385
5386 // Load values from matrix A (interleaved) and matrix B (transposed)
5387 a0 = vload8(0, src_addr_a);
5388 b0 = vload8(0, src_addr_b);
5389
5390 src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
5391 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5392
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005393 c0 = fma((half8)a0.s0, b0, c0);
5394 c1 = fma((half8)a0.s1, b0, c1);
5395 c2 = fma((half8)a0.s2, b0, c2);
5396 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005397
5398 // Load values from matrix B (transposed)
5399 b0 = vload8(0, src_addr_b);
5400
5401 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5402
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005403 c0 = fma((half8)a0.s4, b0, c0);
5404 c1 = fma((half8)a0.s5, b0, c1);
5405 c2 = fma((half8)a0.s6, b0, c2);
5406 c3 = fma((half8)a0.s7, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005407#else // MULT_INTERLEAVE4X4_HEIGHT == 1
5408 // Load values from matrix A (interleaved) and matrix B (transposed)
5409 half4 a0 = vload4(0, src_addr_a);
5410 half8 b0 = vload8(0, src_addr_b);
5411
5412 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5413 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5414
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005415 c0 = fma((half8)a0.s0, b0, c0);
5416 c1 = fma((half8)a0.s1, b0, c1);
5417 c2 = fma((half8)a0.s2, b0, c2);
5418 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005419
5420 // Load values from matrix A (interleaved) and matrix B (transposed)
5421 a0 = vload4(0, src_addr_a);
5422 b0 = vload8(0, src_addr_b);
5423
5424 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5425 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5426
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005427 c0 = fma((half8)a0.s0, b0, c0);
5428 c1 = fma((half8)a0.s1, b0, c1);
5429 c2 = fma((half8)a0.s2, b0, c2);
5430 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005431
5432 // Load values from matrix A (interleaved) and matrix B (transposed)
5433 a0 = vload4(0, src_addr_a);
5434 b0 = vload8(0, src_addr_b);
5435
5436 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5437 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5438
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005439 c0 = fma((half8)a0.s0, b0, c0);
5440 c1 = fma((half8)a0.s1, b0, c1);
5441 c2 = fma((half8)a0.s2, b0, c2);
5442 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005443
5444 // Load values from matrix A (interleaved) and matrix B (transposed)
5445 a0 = vload4(0, src_addr_a);
5446 b0 = vload8(0, src_addr_b);
5447
5448 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5449 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5450
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005451 c0 = fma((half8)a0.s0, b0, c0);
5452 c1 = fma((half8)a0.s1, b0, c1);
5453 c2 = fma((half8)a0.s2, b0, c2);
5454 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005455#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
5456 }
5457
5458 for(; i < (int)(COLS_MTX_B); ++i)
5459 {
5460 // Load values from matrix A (interleaved) and matrix B (transposed)
5461 half4 a0 = vload4(0, src_addr_a);
5462 half8 b0 = vload8(0, src_addr_b);
5463
5464 src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
5465 src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
5466
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005467 c0 = fma((half8)a0.s0, b0, c0);
5468 c1 = fma((half8)a0.s1, b0, c1);
5469 c2 = fma((half8)a0.s2, b0, c2);
5470 c3 = fma((half8)a0.s3, b0, c3);
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005471 }
5472
5473 // Compute destination address
5474 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
5475
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005476 // Compute dst address
5477 __global uchar *dst_addr = offset(&dst, 0, 0);
5478
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005479 uint4 zout = 0;
5480
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005481#if defined(REINTERPRET_OUTPUT_AS_3D)
5482 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005483 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005484 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005485 // | |
5486 // | plane0 |
5487 // | |
5488 // |__________________|
5489 // |******************|
5490 // | cross_plane_pad |
5491 // |******************|
5492 // | |
5493 // | plane1 |
5494 // | |
5495 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005496
5497 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005498 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
5499 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005500
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01005501 // Add offset due to the cross plane paddings
5502 zout *= (cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005503
5504 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
5505 // multiply dst_stride_z by DEPTH_GEMM3D
5506 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005507#else // defined(REINTERPRET_OUTPUT_AS_3D)
5508 // Add offset for batched GEMM
5509 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005510#endif // defined(REINTERPRET_OUTPUT_AS_3D)
5511
5512 // Multiply by the weight of matrix-matrix product and store the result
5513#if defined(ALPHA)
5514 SCALE_BLOCK(4, half, c, ALPHA);
5515#endif // defined(ALPHA)
5516
5517 // Add beta*bias
5518#if defined(BETA)
5519 REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
5520
5521#if defined(BROADCAST_BIAS)
5522 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
5523
5524 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5525
5526#ifndef UNIT_BETA
5527 SCALE_BLOCK(1, half, bias, BETA);
5528#endif // UNIT_BIAS
5529
5530 // c = c + bias[broadcasted]
5531 ADD_BLOCK_BROADCAST(4, c, bias0);
5532
5533#else // defined(BROADCAST_BIAS)
5534 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
5535 2) * src2_stride_z;
5536
5537 LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
5538
5539#ifndef UNIT_BETA
5540 SCALE_BLOCK(4, half, bias, BETA);
5541#endif // UNIT_BIAS
5542
5543 // c = c + bias
5544 ADD_BLOCK(4, c, bias);
5545
5546#endif // defined(BROADCAST_BIAS)
5547#endif // defined(BETA)
5548
5549#if defined(ACTIVATION_TYPE)
5550 ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
5551#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005552
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005553 // Store 4x8 block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005554 vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
5555 vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
5556 vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
5557 vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
Gian Marco Iodicebb36a8e2018-04-19 12:05:08 +01005558}
Georgios Pinitas84225582018-05-14 12:00:05 +01005559
5560// Undefine local defines
5561#undef COLS_MTX_B
5562
Matthew Bentham6f31f8c2017-10-27 11:50:06 +01005563#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01005564
Gian Marco36a0a462018-01-12 10:21:40 +00005565#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01005566
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01005567#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)
#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * Each work-item accumulates a tile of NUM_ELEMS_PROCESSED_PER_THREAD_Y rows by NUM_ELEMS_PROCESSED_PER_THREAD_X columns.
 * get_global_id(0) selects the tile column, get_global_id(1) the tile row and get_global_id(2) the batch.
 * The result is post-processed in this order: scaling by ALPHA (if defined), addition of BETA * bias (if BETA is defined),
 * activation (if ACTIVATION_TYPE is defined), store.
 *
 * @note This OpenCL kernel works with floating point data types (F16/F32)
 * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
 * @note The kernel body only declares accumulators for up to 4 rows (acc0..acc3), i.e. NUM_ELEMS_PROCESSED_PER_THREAD_Y in [1, 4]
 * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of a plane of the tensor that has to be reinterpreted as 3D.
 *       -# DEPTH_GEMM3D: The depth (number of planes) of the tensor that has to be reinterpreted as 3D
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows (M) of matrix A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
                                     IMAGE_DECLARATION(src1),
#if defined(BETA)
                                     IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                     IMAGE_DECLARATION(dst),
                                     uint src0_stride_z,
                                     uint src1_stride_z,
#if defined(BETA)
                                     uint src2_stride_z,
#endif // defined(BETA)
                                     uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                     ,
                                     uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                     ,
                                     uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                    )
{
    // Index of the first output column handled by this work-item
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B
    // (.s0 is the byte offset into src0_ptr, .s1 the byte offset into src1_ptr)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A: move to the first row of this work-item's tile
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B: move to the first column of this work-item's tile
    src_addr.s1 += idx * sizeof(DATA_TYPE);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp to the last plane
    zin       = min(DEPTH_GEMM3D - 1, zin);

    // Add offset due to the cross plane paddings (one byte offset per tile row)
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Byte offset one past the last element of the current row of matrix A
    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

    // One accumulator vector per output row of the tile
    VECTOR_TYPE acc0 = 0.0f;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_TYPE acc1 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_TYPE acc2 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_TYPE acc3 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consume two elements of each row of A (and two rows of B) per iteration,
    // as long as at least two elements are left in the row of A
    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        // LOAD_BLOCK comes from gemm_helpers.h; the code below relies on it declaring the 2-element
        // vectors a0..a{Y-1} - presumably it also adds zin.s<row> to each row address (TODO confirm in gemm_helpers.h)
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B (two consecutive rows)
        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
        VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

        // Accumulate
        acc0 += b0 * (VECTOR_TYPE)a0.s0;
        acc0 += b1 * (VECTOR_TYPE)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (VECTOR_TYPE)a1.s0;
        acc1 += b1 * (VECTOR_TYPE)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (VECTOR_TYPE)a2.s0;
        acc2 += b1 * (VECTOR_TYPE)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (VECTOR_TYPE)a3.s0;
        acc3 += b1 * (VECTOR_TYPE)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Left-over loop: consumes one element of A (one row of B) at a time, i.e. the last element when COLS_A is odd
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A (each row carries its own cross-plane offset)
        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

        // Accumulate
        acc0 += b0 * (VECTOR_TYPE)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (VECTOR_TYPE)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (VECTOR_TYPE)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (VECTOR_TYPE)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Batch index
    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row byte offset due to cross plane paddings; stays 0 unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp to the last plane
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product (alpha)
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    // All-zero z offsets for the bias loads (no cross plane padding is applied to the bias)
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every row of the tile
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

    // The scaling is skipped when beta is known to be 1 (UNIT_BETA)
#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else // defined(BROADCAST_BIAS)
    // The bias has one row per output row: address the tile at the same (x, y, batch) position as the output
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

    // The scaling is skipped when beta is known to be 1 (UNIT_BETA)
#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);
}
#endif // defined(DATA_TYPE)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01005881
Michele Di Giorgiof6f08da2018-04-26 10:24:30 +01005882/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005883 *
5884 * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
5885 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
5886 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
5887 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
5888 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005889 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
5890 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005891 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005892 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
5893 * The activation function is performed after the bias addition
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005894 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
5895 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005896 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
5897 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
5898 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
5899 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
5900 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005901 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005902 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
5903 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5904 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
5905 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5906 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
5907 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
5908 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
5909 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
5910 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
5911 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
5912 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
5932__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
5933 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005934#if defined(BETA)
5935 IMAGE_DECLARATION(src2),
5936#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00005937 IMAGE_DECLARATION(dst),
5938 uint src0_stride_z,
5939 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01005940#if defined(BETA)
5941 uint src2_stride_z,
5942#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005943 uint dst_stride_z
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005944#if defined(REINTERPRET_INPUT_AS_3D)
5945 ,
5946 uint src_cross_plane_pad
5947#endif // REINTERPRET_INPUT_AS_3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005948#if defined(REINTERPRET_OUTPUT_AS_3D)
5949 ,
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005950 uint dst_cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00005951#endif // REINTERPRET_OUTPUT_AS_3D
5952 )
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00005953{
5954 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
5955
5956 // Compute starting address for matrix A and matrix B
5957 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
5958
5959 // Update address for matrix A
5960 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
5961
5962 // Update address for matrix B
5963 src_addr.s1 += idx * sizeof(float);
5964
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005965#if defined(REINTERPRET_INPUT_AS_3D)
5966 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
5967 // in order to take into account the presence of possible cross plane paddings
5968 //
5969 // | |
5970 // | plane0 |
5971 // | |
5972 // |__________________|
5973 // |******************|
5974 // | cross_plane_pad |
5975 // |******************|
5976 // | |
5977 // | plane1 |
5978 // | |
5979 // |__________________|
5980
5981 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
5982 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
5983 zin = min(DEPTH_GEMM3D - 1, zin);
5984
5985 // Add offset due to the cross plane paddings
5986 zin *= (src_cross_plane_pad * src0_stride_y);
5987
5988 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
5989 // multiply src0_stride_z by DEPTH_GEMM3D
5990 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
5991
5992#else // defined(REINTERPRET_INPUT_AS_3D)
5993
Gian Marcoae2af742018-02-15 12:35:44 +00005994 // Add offset for batched GEMM
5995 src_addr.s0 += get_global_id(2) * src0_stride_z;
5996
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01005997#endif // defined(REINTERPRET_INPUT_AS_3D)
5998
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00005999#if defined(MATRIX_B_DEPTH)
6000 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
6001 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6002#else // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006003 src_addr.s1 += get_global_id(2) * src1_stride_z;
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006004#endif // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006005
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006006 // Initialize accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006007 float4 acc0 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006008
6009#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006010 float4 acc1 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006011#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6012
6013#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006014 float4 acc2 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006015#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6016
6017#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006018 float4 acc3 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006019#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6020
6021 // A and B src indices get incremented at the same time.
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006022 int i = 0;
6023 for(; i <= ((int)COLS_A - 4); i += 4)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006024 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006025#if defined(REINTERPRET_INPUT_AS_3D)
6026 // Load values from matrix A and matrix B
Usama Arif0681e3b2019-04-25 14:28:07 +01006027 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
6028#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006029 // Load values from matrix A and matrix B
6030 float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006031#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006032 float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006033#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6034#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006035 float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006036#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6037#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006038 float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006039#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006040#endif // defined(REINTERPRET_INPUT_AS_3D)
6041
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006042 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6043 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006044
6045 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006046 acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
6047 acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
6048 acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
6049 acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006050
6051#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006052
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006053 acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
6054 acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
6055 acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
6056 acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006057
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006058#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6059#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006060
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006061 acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
6062 acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
6063 acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
6064 acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006065
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006066#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6067#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006068
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006069 acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
6070 acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
6071 acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
6072 acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006073#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006074
6075 // Load values from matrix A and matrix B
6076 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6077 src_addr.s1 += src1_stride_y;
6078
6079 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006080 acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
6081 acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
6082 acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
6083 acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006084
6085#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6086
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006087 acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
6088 acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
6089 acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
6090 acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006091
6092#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6093#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6094
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006095 acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
6096 acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
6097 acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
6098 acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006099
6100#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6101#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6102
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006103 acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
6104 acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
6105 acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
6106 acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006107#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6108
6109 // Load values from matrix A and matrix B
6110 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6111 src_addr.s1 += src1_stride_y;
6112
6113 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006114 acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
6115 acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
6116 acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
6117 acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006118
6119#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6120
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006121 acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
6122 acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
6123 acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
6124 acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006125
6126#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6127#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6128
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006129 acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
6130 acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
6131 acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
6132 acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006133
6134#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6135#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6136
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006137 acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
6138 acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
6139 acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
6140 acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006141#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6142
6143 // Load values from matrix A and matrix B
6144 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
6145 src_addr.s1 += src1_stride_y;
6146
6147 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006148 acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
6149 acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
6150 acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
6151 acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006152
6153#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6154
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006155 acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
6156 acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
6157 acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
6158 acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006159
6160#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6161#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6162
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006163 acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
6164 acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
6165 acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
6166 acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006167
6168#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6169#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6170
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006171 acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
6172 acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
6173 acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
6174 acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006175#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6176
6177 src_addr.s0 += 4 * sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006178 }
6179
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006180 for(; i < (int)COLS_A; ++i)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006181 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006182#if defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006183 // Load values from matrix A
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006184 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
6185#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6186 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6187#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6188#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6189 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6190#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6191#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6192 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6193#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6194#else // defined(REINTERPRET_INPUT_AS_3D)
6195 // Load values from matrix A
6196 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006197#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6198 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6199#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6200#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6201 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6202#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6203#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6204 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6205#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006206#endif // defined(REINTERPRET_INPUT_AS_3D)
6207
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006208 // Load values from matrix B
6209 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006210 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006211
6212 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006213 acc0.s0 = fma(a0, b0.s0, acc0.s0);
6214 acc0.s1 = fma(a0, b0.s1, acc0.s1);
6215 acc0.s2 = fma(a0, b0.s2, acc0.s2);
6216 acc0.s3 = fma(a0, b0.s3, acc0.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006217#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006218 acc1.s0 = fma(a1, b0.s0, acc1.s0);
6219 acc1.s1 = fma(a1, b0.s1, acc1.s1);
6220 acc1.s2 = fma(a1, b0.s2, acc1.s2);
6221 acc1.s3 = fma(a1, b0.s3, acc1.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006222#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6223#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006224 acc2.s0 = fma(a2, b0.s0, acc2.s0);
6225 acc2.s1 = fma(a2, b0.s1, acc2.s1);
6226 acc2.s2 = fma(a2, b0.s2, acc2.s2);
6227 acc2.s3 = fma(a2, b0.s3, acc2.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006228#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6229#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006230 acc3.s0 = fma(a3, b0.s0, acc3.s0);
6231 acc3.s1 = fma(a3, b0.s1, acc3.s1);
6232 acc3.s2 = fma(a3, b0.s2, acc3.s2);
6233 acc3.s3 = fma(a3, b0.s3, acc3.s3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006234#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006235
6236 src_addr.s0 += sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006237 }
6238
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006239 int z = get_global_id(2);
6240
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006241 // Compute destination address
6242 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
6243
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006244 // Compute dst address
6245 __global uchar *dst_addr = offset(&dst, 0, 0);
6246
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006247 uint4 zout = 0;
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00006248
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006249#if defined(REINTERPRET_OUTPUT_AS_3D)
6250 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006251 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006252 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006253 // | |
6254 // | plane0 |
6255 // | |
6256 // |__________________|
6257 // |******************|
6258 // | cross_plane_pad |
6259 // |******************|
6260 // | |
6261 // | plane1 |
6262 // | |
6263 // |__________________|
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006264
6265 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006266 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6267 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006268
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006269 // Add offset due to the cross plane paddings
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006270 zout *= (dst_cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006271
6272 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6273 // multiply dst_stride_z by DEPTH_GEMM3D
6274 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006275#else // defined(REINTERPRET_OUTPUT_AS_3D)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006276 // Add offset for batched GEMM
6277 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006278#endif // defined(REINTERPRET_OUTPUT_AS_3D)
6279
6280 // Multiply by the weight of matrix-matrix product and store the result
6281#if defined(ALPHA)
6282 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
6283#endif // defined(ALPHA)
6284
6285 // Add beta*bias
6286#if defined(BETA)
6287 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
6288
6289#if defined(BROADCAST_BIAS)
6290 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
6291
6292 LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
6293
6294#ifndef UNIT_BETA
6295 SCALE_BLOCK(1, float, bias, BETA);
6296#endif // UNIT_BIAS
6297
6298 // acc = acc + bias[broadcasted]
6299 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
6300
6301#else // defined(BROADCAST_BIAS)
6302 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *
6303 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
6304
6305 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
6306
6307#ifndef UNIT_BETA
6308 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
6309#endif // UNIT_BIAS
6310
6311 // acc = acc + bias
6312 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
6313
6314#endif // defined(BROADCAST_BIAS)
6315#endif // defined(BETA)
6316
6317#if defined(ACTIVATION_TYPE)
6318 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
6319#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006320
6321 // Store the output block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006322 vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006323#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006324 vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006325#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6326#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006327 vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006328#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6329#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006330 vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006331#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006332}
6333
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
 * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
 * The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped
 *
 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
 * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
 * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
 * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
 * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
6385__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
6386 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006387#if defined(BETA)
6388 IMAGE_DECLARATION(src2),
6389#endif // defined(BETA)
Gian Marcoae2af742018-02-15 12:35:44 +00006390 IMAGE_DECLARATION(dst),
6391 uint src0_stride_z,
6392 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006393#if defined(BETA)
6394 uint src2_stride_z,
6395#endif //defined(BETA)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006396 uint dst_stride_z
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006397#if defined(REINTERPRET_INPUT_AS_3D)
6398 ,
6399 uint src_cross_plane_pad
6400#endif // REINTERPRET_INPUT_AS_3D
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006401#if defined(REINTERPRET_OUTPUT_AS_3D)
6402 ,
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006403 uint dst_cross_plane_pad
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006404#endif // REINTERPRET_OUTPUT_AS_3D
6405 )
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006406{
6407 // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6408 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
6409
6410 // Compute starting address for matrix A and Matrix B
6411 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
6412
6413 // Update address for the matrix A
6414 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
6415
6416 // Update address for the matrix B
6417 src_addr.s1 += idx * sizeof(float);
6418
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006419#if defined(REINTERPRET_INPUT_AS_3D)
6420 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
6421 // in order to take into account the presence of possible cross plane paddings
6422 //
6423 // | |
6424 // | plane0 |
6425 // | |
6426 // |__________________|
6427 // |******************|
6428 // | cross_plane_pad |
6429 // |******************|
6430 // | |
6431 // | plane1 |
6432 // | |
6433 // |__________________|
6434
6435 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
6436 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6437 zin = min(DEPTH_GEMM3D - 1, zin);
6438
6439 // Add offset due to the cross plane paddings
6440 zin *= (src_cross_plane_pad * src0_stride_y);
6441
6442 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6443 // multiply src0_stride_z by DEPTH_GEMM3D
6444 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
6445
6446#else // defined(REINTERPRET_INPUT_AS_3D)
6447
Gian Marcoae2af742018-02-15 12:35:44 +00006448 // Add offset for batched GEMM
6449 src_addr.s0 += get_global_id(2) * src0_stride_z;
6450
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006451#endif // defined(REINTERPRET_INPUT_AS_3D)
6452
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006453#if defined(MATRIX_B_DEPTH)
6454 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
6455 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6456#else // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006457 src_addr.s1 += get_global_id(2) * src1_stride_z;
Gian Marco Iodiced2fab732018-03-02 11:18:12 +00006458#endif // defined(MATRIX_B_DEPTH)
Gian Marcoae2af742018-02-15 12:35:44 +00006459
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006460 // Initialize accumulators
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006461 float2 acc0 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006462#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006463 float2 acc1 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006464#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6465#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006466 float2 acc2 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006467#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6468#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006469 float2 acc3 = 0.0f;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006470#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6471
6472 // A and B src indices get incremented at the same time.
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006473 int i = 0;
6474 for(; i <= ((int)COLS_A - 8); i += 8)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006475 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006476#if defined(REINTERPRET_INPUT_AS_3D)
6477 // Load values from matrix A
6478 float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
6479#else // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006480 // Load values from matrix A
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006481 float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006482#endif // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006483
6484 // Load values from matrix B
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006485 float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6486 src_addr.s1 += src1_stride_y;
6487 float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6488 src_addr.s1 += src1_stride_y;
6489 float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6490 src_addr.s1 += src1_stride_y;
6491 float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6492 src_addr.s1 += src1_stride_y;
6493 float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6494 src_addr.s1 += src1_stride_y;
6495 float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6496 src_addr.s1 += src1_stride_y;
6497 float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6498 src_addr.s1 += src1_stride_y;
6499 float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
6500 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006501
6502 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006503 acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
6504 acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
6505 acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
6506 acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
6507 acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
6508 acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
6509 acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
6510 acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006511
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006512 acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
6513 acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
6514 acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
6515 acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
6516 acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
6517 acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
6518 acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
6519 acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006520
6521#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006522#if defined(REINTERPRET_INPUT_AS_3D)
6523 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6524#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006525 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006526#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006527 acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
6528 acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
6529 acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
6530 acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
6531 acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
6532 acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
6533 acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
6534 acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006535
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006536 acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
6537 acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
6538 acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
6539 acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
6540 acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
6541 acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
6542 acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
6543 acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006544#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6545#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006546#if defined(REINTERPRET_INPUT_AS_3D)
6547 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6548#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006549 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006550#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006551 acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
6552 acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
6553 acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
6554 acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
6555 acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
6556 acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
6557 acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
6558 acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006559
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006560 acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
6561 acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
6562 acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
6563 acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
6564 acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
6565 acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
6566 acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
6567 acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006568#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6569#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006570#if defined(REINTERPRET_INPUT_AS_3D)
6571 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6572#else // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006573 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006574#endif // defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006575 acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
6576 acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
6577 acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
6578 acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
6579 acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
6580 acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
6581 acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
6582 acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006583
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006584 acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
6585 acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
6586 acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
6587 acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
6588 acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
6589 acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
6590 acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
6591 acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006592#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006593
6594 src_addr.s0 += sizeof(float) * 8;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006595 }
6596 // float size increment
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006597 for(; i < (int)COLS_A; ++i)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006598 {
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006599#if defined(REINTERPRET_INPUT_AS_3D)
6600 // Load values from matrix A
6601 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
6602#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6603 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6604#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6605#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6606 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6607#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6608#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6609 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6610#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6611#else // defined(REINTERPRET_INPUT_AS_3D)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006612 // Load values from matrix A
6613 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
6614#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6615 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6616#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6617#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6618 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6619#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6620#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6621 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6622#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006623#endif // defined(REINTERPRET_INPUT_AS_3D)
6624
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006625 // Load values from matrix B
6626 float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006627 src_addr.s1 += src1_stride_y;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006628
6629 // Multiply and accumulate
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006630 acc0.s0 = fma(a0, b0.s0, acc0.s0);
6631 acc0.s1 = fma(a0, b0.s1, acc0.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006632#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006633 acc1.s0 = fma(a1, b0.s0, acc1.s0);
6634 acc1.s1 = fma(a1, b0.s1, acc1.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006635#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6636#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006637 acc2.s0 = fma(a2, b0.s0, acc2.s0);
6638 acc2.s1 = fma(a2, b0.s1, acc2.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006639#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6640#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006641 acc3.s0 = fma(a3, b0.s0, acc3.s0);
6642 acc3.s1 = fma(a3, b0.s1, acc3.s1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006643#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodicec9c62c22018-04-06 10:00:10 +01006644
6645 src_addr.s0 += sizeof(float);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006646 }
6647
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006648 int z = get_global_id(2);
6649
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006650 // Compute destination address
6651 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
6652
Gian Marcoae2af742018-02-15 12:35:44 +00006653 // Compute dst address
6654 __global uchar *dst_addr = offset(&dst, 0, 0);
6655
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006656 uint4 zout = 0;
Michele Di Giorgioebc3a902018-11-16 16:04:25 +00006657
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006658#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006659
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006660 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006661 // in order to take into account the presence of possible cross plane paddings
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006662 //
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006663 // | |
6664 // | plane0 |
6665 // | |
6666 // |__________________|
6667 // |******************|
6668 // | cross_plane_pad |
6669 // |******************|
6670 // | |
6671 // | plane1 |
6672 // | |
6673 // |__________________|
Gian Marcoae2af742018-02-15 12:35:44 +00006674
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006675 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006676 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6677 zout = min(DEPTH_GEMM3D - 1, zout);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006678
Georgios Pinitase8bd2c72018-07-11 15:54:56 +01006679 // Add offset due to the cross plane paddings
Gian Marco Iodice68a3f562018-07-26 11:44:03 +01006680 zout *= (dst_cross_plane_pad * dst_stride_y);
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006681
6682 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6683 // multiply dst_stride_z by DEPTH_GEMM3D
6684 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006685#else // defined(REINTERPRET_OUTPUT_AS_3D)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006686 // Add offset for batched GEMM
6687 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006688#endif // defined(REINTERPRET_OUTPUT_AS_3D)
6689
6690 // Multiply by the weight of matrix-matrix product and store the result
6691#if defined(ALPHA)
6692 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
6693#endif // defined(ALPHA)
6694
6695 // Add beta*bias
6696#if defined(BETA)
6697 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
6698
6699#if defined(BROADCAST_BIAS)
6700 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
6701
6702 LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
6703
6704#ifndef UNIT_BETA
6705 SCALE_BLOCK(1, float, bias, BETA);
6706#endif // UNIT_BIAS
6707
6708 // acc = acc + bias[broadcasted]
6709 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
6710
6711#else // defined(BROADCAST_BIAS)
6712 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *
6713 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
6714
6715 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
6716
6717#ifndef UNIT_BETA
6718 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
6719#endif // UNIT_BIAS
6720
6721 // acc = acc + bias
6722 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
6723
6724#endif // defined(BROADCAST_BIAS)
6725#endif // defined(BETA)
6726
6727#if defined(ACTIVATION_TYPE)
6728 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
6729#endif // defined(ACTIVATION_TYPE)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006730
6731 // Store the output block
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006732 vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006733#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006734 vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006735#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6736#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006737 vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006738#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6739#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006740 vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
Isabella Gottardi8e74f442018-03-01 16:42:00 +00006741#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00006742}
6743
Vidhya Sudhan Loganathanbdff4912018-05-22 15:03:09 +01006744#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01006745/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
6746 *
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006747 * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
6748 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
6749 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
6750 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
6751 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006752 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
6753 * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006754 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006755 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
6756 * The activation function is performed after the bias addition
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006757 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
6758 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
6759 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
6760 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
6761 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
6762 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
6763 *
6764 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
6765 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
6766 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
6767 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
6768 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
6769 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
6770 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
6771 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
6772 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
6773 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
6774 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
6775 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006776 * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
6777 * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
6778 * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
6779 * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
6780 * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
6781 * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006782 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
6783 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
6784 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
6785 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
6786 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
6787 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
6788 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
6789 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006790 * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006791 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
6792 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
6793 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
6794 */
6795__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
6796 IMAGE_DECLARATION(src1),
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006797#if defined(BETA)
6798 IMAGE_DECLARATION(src2),
6799#endif // defined(BETA)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006800 IMAGE_DECLARATION(dst),
6801 uint src0_stride_z,
6802 uint src1_stride_z,
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01006803#if defined(BETA)
6804 uint src2_stride_z,
6805#endif //defined(BETA)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006806 uint dst_stride_z
6807#if defined(REINTERPRET_INPUT_AS_3D)
6808 ,
6809 uint src_cross_plane_pad
6810#endif // REINTERPRET_INPUT_AS_3D
6811#if defined(REINTERPRET_OUTPUT_AS_3D)
6812 ,
6813 uint dst_cross_plane_pad
6814#endif // REINTERPRET_OUTPUT_AS_3D
6815 )
6816{
6817 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
6818
6819 // Compute starting address for matrix A and Matrix B
6820 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
6821
6822 // Update address for the matrix A
6823 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
6824
6825 // Update address for the matrix B
6826 src_addr.s1 += idx * sizeof(half);
6827
6828#if defined(REINTERPRET_INPUT_AS_3D)
6829 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
6830 // in order to take into account the presence of possible cross plane paddings
6831 //
6832 // | |
6833 // | plane0 |
6834 // | |
6835 // |__________________|
6836 // |******************|
6837 // | cross_plane_pad |
6838 // |******************|
6839 // | |
6840 // | plane1 |
6841 // | |
6842 // |__________________|
6843
6844 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
6845 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
6846 zin = min(DEPTH_GEMM3D - 1, zin);
6847
6848 // Add offset due to the cross plane paddings
6849 zin *= (src_cross_plane_pad * src0_stride_y);
6850
6851 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
6852 // multiply src0_stride_z by DEPTH_GEMM3D
6853 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
6854
6855#else // defined(REINTERPRET_INPUT_AS_3D)
6856
6857 // Add offset for batched GEMM
6858 src_addr.s0 += get_global_id(2) * src0_stride_z;
6859
6860#endif // defined(REINTERPRET_INPUT_AS_3D)
6861
6862#if defined(MATRIX_B_DEPTH)
6863 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
6864 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
6865#else // defined(MATRIX_B_DEPTH)
6866 src_addr.s1 += get_global_id(2) * src1_stride_z;
6867#endif // defined(MATRIX_B_DEPTH)
6868
6869 float8 acc0 = 0.0h;
6870#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6871 float8 acc1 = 0.0h;
6872#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6873#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6874 float8 acc2 = 0.0h;
6875#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6876#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6877 float8 acc3 = 0.0h;
6878#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6879
6880 int i = 0;
6881 for(; i <= ((int)COLS_A - 4); i += 4)
6882 {
6883#if defined(REINTERPRET_INPUT_AS_3D)
6884 // Load values from matrix A
Usama Arif0681e3b2019-04-25 14:28:07 +01006885 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
6886#else // defined(REINTERPRET_INPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00006887 // Load values from matrix A
6888 half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
6889#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6890 half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6891#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6892#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6893 half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6894#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6895#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6896 half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6897#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6898#endif // defined(REINTERPRET_INPUT_AS_3D)
6899
6900 // Load values from matrix B
6901 float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6902 src_addr.s1 += src1_stride_y;
6903
6904 // Accumulate
6905 acc0 = fma(b0, (float8)a0.s0, acc0);
6906#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6907 acc1 = fma(b0, (float8)a1.s0, acc1);
6908#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6909#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6910 acc2 = fma(b0, (float8)a2.s0, acc2);
6911#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6912#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6913 acc3 = fma(b0, (float8)a3.s0, acc3);
6914#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6915
6916 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6917 src_addr.s1 += src1_stride_y;
6918 acc0 = fma(b0, (float8)a0.s1, acc0);
6919#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6920 acc1 = fma(b0, (float8)a1.s1, acc1);
6921#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6922#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6923 acc2 = fma(b0, (float8)a2.s1, acc2);
6924#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6925#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6926 acc3 = fma(b0, (float8)a3.s1, acc3);
6927#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6928
6929 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6930 src_addr.s1 += src1_stride_y;
6931 acc0 = fma(b0, (float8)a0.s2, acc0);
6932#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6933 acc1 = fma(b0, (float8)a1.s2, acc1);
6934#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6935#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6936 acc2 = fma(b0, (float8)a2.s2, acc2);
6937#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6938#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6939 acc3 = fma(b0, (float8)a3.s2, acc3);
6940#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6941
6942 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6943 src_addr.s1 += src1_stride_y;
6944 acc0 = fma(b0, (float8)a0.s3, acc0);
6945#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6946 acc1 = fma(b0, (float8)a1.s3, acc1);
6947#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6948#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6949 acc2 = fma(b0, (float8)a2.s3, acc2);
6950#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6951#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6952 acc3 = fma(b0, (float8)a3.s3, acc3);
6953#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6954
6955 src_addr.s0 += 4 * sizeof(half);
6956 }
6957
6958 for(; i < (int)COLS_A; ++i)
6959 {
6960#if defined(REINTERPRET_INPUT_AS_3D)
6961 // Load values from matrix A
6962 half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
6963#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6964 half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
6965#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6966#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6967 half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
6968#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6969#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6970 half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
6971#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6972#else // defined(REINTERPRET_INPUT_AS_3D)
6973 // Load values from matrix A
6974 half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
6975#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6976 half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
6977#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6978#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6979 half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
6980#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6981#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6982 half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
6983#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
6984#endif // defined(REINTERPRET_INPUT_AS_3D)
6985
6986 // Load values from matrix B
6987 float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
6988
6989 src_addr += (int2)(sizeof(half), src1_stride_y);
6990
6991 // Accumulate
6992 acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;
6993#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6994 acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;
6995#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
6996#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6997 acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;
6998#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
6999#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7000 acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;
7001#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7002 }
7003
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007004 int z = get_global_id(2);
7005
7006 // Compute destination address
7007 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
7008
7009 // Compute dst address
7010 __global uchar *dst_addr = offset(&dst, 0, 0);
7011
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007012 uint4 zout = 0;
7013
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007014#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007015
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007016 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
7017 // in order to take into account the presence of possible cross plane paddings
7018 //
7019 // | |
7020 // | plane0 |
7021 // | |
7022 // |__________________|
7023 // |******************|
7024 // | cross_plane_pad |
7025 // |******************|
7026 // | |
7027 // | plane1 |
7028 // | |
7029 // |__________________|
7030
7031 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007032 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
7033 zout = min(DEPTH_GEMM3D - 1, zout);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007034
7035 // Add offset due to the cross plane paddings
7036 zout *= (dst_cross_plane_pad * dst_stride_y);
7037
7038 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
7039 // multiply dst_stride_z by DEPTH_GEMM3D
7040 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007041#else // defined(REINTERPRET_OUTPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007042 // Add offset for batched GEMM
7043 dst_addr += z * dst_stride_z;
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007044#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007045
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007046 // Multiply by the weight of matrix-matrix product and store the result
7047#if defined(ALPHA)
7048 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
7049#endif // defined(ALPHA)
7050
7051#if defined(BETA)
7052 REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
7053
7054#if defined(BROADCAST_BIAS)
7055 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
7056
7057 LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
7058
7059 float8 bias_f0 = convert_float8(bias0);
7060
7061#ifndef UNIT_BETA
7062 SCALE_BLOCK(1, float, bias_f, BETA);
7063#endif // UNIT_BIAS
7064
7065 // acc = acc + bias[broadcasted]
7066 ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);
7067
7068#else // defined(BROADCAST_BIAS)
7069 __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
7070 (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;
7071
7072 LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
7073
7074 float8 bias_f0 = convert_float8(bias0);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007075#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007076 float8 bias_f1 = convert_float8(bias1);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007077#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7078#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007079 float8 bias_f2 = convert_float8(bias2);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007080#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7081#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007082 float8 bias_f3 = convert_float8(bias3);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007083#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01007084
7085#ifndef UNIT_BETA
7086 SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);
7087#endif // UNIT_BIAS
7088
7089 // acc = acc + bias
7090 ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);
7091
7092#endif // defined(BROADCAST_BIAS)
7093#endif // defined(BETA)
7094
7095 half8 acc_h0 = convert_half8(acc0);
7096#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7097 half8 acc_h1 = convert_half8(acc1);
7098#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
7099#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7100 half8 acc_h2 = convert_half8(acc2);
7101#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
7102#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7103 half8 acc_h3 = convert_half8(acc3);
7104#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
7105
7106#if defined(ACTIVATION_TYPE)
7107 ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);
7108#endif // defined(ACTIVATION_TYPE)
7109
7110 // Store the output block
7111 STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);
Vidhya Sudhan Loganathana25d16c2018-11-16 11:33:12 +00007112}
7113
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * Each work-item accumulates a tile of NUM_ELEMS_PROCESSED_PER_THREAD_Y rows by 8 columns of the output:
 * the main loop consumes 4 columns of A per iteration and a scalar loop handles the remaining (COLS_A % 4) columns.
 * After the accumulation the kernel optionally scales by ALPHA, adds (BETA *) bias, applies the activation and stores the tile.
 *
 * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
 *       The accumulators are half8, i.e. the accumulation is carried out in half precision.
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
 *       Accumulators are only provided for NUM_ELEMS_PROCESSED_PER_THREAD_Y in the range [1, 4].
 *       NOTE(review): the body always loads 8 values of matrix B (vload8), stores 8 values per row and computes the bias offset as get_global_id(0) * 8,
 *       so -DNUM_ELEMS_PROCESSED_PER_THREAD_X is presumably expected to be 8 (an earlier revision of this note stated 4) - confirm against the host-side configuration.
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
 * @note The bias (src2) is only added if -DBETA is passed at compile time. The multiplication by beta is skipped if -DUNIT_BETA is passed.
 *       If -DBROADCAST_BIAS is passed, a single row of bias is loaded and added to every row of the tile.
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                 IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
#if defined(BETA)
                                                 uint src2_stride_z,
#endif // defined(BETA)
                                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                 ,
                                                 uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                )
{
    // First column (in elements) of matrix B handled by this work-item
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B (s0: byte offset into src0, s1: byte offset into src1)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx * sizeof(half);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp the plane index to the last plane
    zin = min(DEPTH_GEMM3D - 1, zin);

    // Add offset due to the cross plane paddings (zin becomes a per-row byte offset)
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // One half8 accumulator per output row processed by this work-item
    half8 acc0 = 0.0h;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    half8 acc1 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    half8 acc2 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    half8 acc3 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: 4 columns of A (and therefore 4 rows of B) per iteration
    int i = 0;
    for(; i <= ((int)COLS_A - 4); i += 4)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;

        // Accumulate
        acc0 = fma(b0, (half8)a0.s0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s0, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s0, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s0, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s1, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s1, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s1, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s2, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s2, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s2, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s3, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s3, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s3, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Advance matrix A by the 4 columns just consumed
        src_addr.s0 += 4 * sizeof(half);
    }

    // Left-over loop: one column of A (one row of B) per iteration
    for(; i < (int)COLS_A; ++i)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

        // Advance A by one element and B by one row
        src_addr += (int2)(sizeof(half), src1_stride_y);

        // Accumulate
        acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Batch index
    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset used by STORE_BLOCK; stays 0 unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp the plane index to the last plane
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product (alpha)
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    // Zero z-offsets passed to LOAD_BLOCK when loading the bias
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // Single bias row: only the column offset depends on the work-item
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else // defined(BROADCAST_BIAS)
    // Full bias tile: column, row and batch offsets all depend on the work-item
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the output block
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
}
Vidhya Sudhan Loganathanbdff4912018-05-22 15:03:09 +01007459#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Gian Marco Iodicefd683112018-04-17 09:52:44 +01007460
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01007461#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007462
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007463#if defined(BETA)
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
 *
 *  dst = dst + BETA * src
 *
 * Each work-item processes 4 consecutive float values.
 *
 * @note The value of beta needs to be passed at compile time using -DBETA
 *
 * @param[in]      src_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]      src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]      src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]      src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]      src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]      src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in, out] dst_ptr                           Pointer to the destination matrix, which holds the A x B result on entry and is updated in place. Supported data types: same as @p src_ptr
 * @param[in]      dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]      dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]      dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]      dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]      dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Load values from A x B
    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

    // Load values from Matrix C
    float4 c = vload4(0, (__global float *)src.ptr);

    // Computes alpha * axb + beta * c
    float4 out = alpha_ab + (float4)BETA * c;

    // Store final result in axb matrix
    vstore4(out, 0, (__global float *)dst.ptr);
}
7504
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
 *
 *  dst = dst + BETA * src
 *
 * Each work-item processes 8 consecutive half values.
 *
 * @note The value of beta needs to be passed at compile time using -DBETA
 *
 * @param[in]      src_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]      src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]      src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]      src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]      src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]      src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in, out] dst_ptr                           Pointer to the destination matrix, which holds the A x B result on entry and is updated in place. Supported data types: same as @p src_ptr
 * @param[in]      dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]      dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]      dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]      dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]      dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]      dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]      dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Load values from A x B
    half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

    // Load values from Matrix C
    half8 c = vload8(0, (__global half *)src.ptr);

    // Computes alpha * axb + beta * c
    half8 out = alpha_ab + (half8)BETA * c;

    // Store final result in axb matrix
    vstore8(out, 0, (__global half *)dst.ptr);
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007547#endif // defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007548
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007549#if defined(WIDTH_VECTOR_A)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007550/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
7551 *
Gian Marco19835e52018-01-30 13:35:54 +00007552 * @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007553 *
Gian Marco19835e52018-01-30 13:35:54 +00007554 * @note The input A and matrix B must not be reshaped
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007555 *
7556 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
7557 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
7558 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
7559 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
7560 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
7561 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01007562 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007563 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
7564 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
7565 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
7566 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
7567 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
7568 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
7569 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01007570 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007571 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
7572 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
7573 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
7574 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
7575 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
7576 */
7577__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
7578 TENSOR3D_DECLARATION(src1),
7579 IMAGE_DECLARATION(dst))
7580{
7581 int idx = get_global_id(0) * 4;
7582 int idy = get_global_id(1);
7583
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007584 // Compute the address for the vector A and matrix B
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007585 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
7586 src_addr.s1 += idx * sizeof(float);
7587
7588 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
7589
7590 float4 acc = 0.0f;
7591
Georgios Pinitas96880cf2017-10-20 18:52:20 +01007592 for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007593 {
7594 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
7595 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
7596 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
7597
7598 acc += b0 * (float4)a0.s0;
7599 acc += b1 * (float4)a0.s1;
7600 }
7601
7602 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
7603 {
7604 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
7605 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
7606
7607 acc += b0 * (float4)a0;
7608 }
7609
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007610 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +01007611 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
7612
7613 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
7614}
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00007615#endif // defined(WIDTH_VECTOR_A)