blob: 10435d376ff94f9055a736f72045cf6e5ab3a954 [file] [log] [blame]
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Usama Arif0681e3b2019-04-25 14:28:07 +010024#include "gemm_helpers.h"
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +000025#include "repeat.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010027#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
/* Per-lane index vectors (0, 1, ..., width - 1), one for each supported vector width.
 * Each expansion is wrapped in parentheses so it behaves as a single operand
 * wherever the macro is used inside a larger expression.
 */
#define INC2 ((VEC_DATA_TYPE(uint, 2))(0, 1))
#define INC3 ((VEC_DATA_TYPE(uint, 3))(0, 1, 2))
#define INC4 ((VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3))
#define INC8 ((VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7))
#define INC16 ((VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
/* Two-level expansion: INC(K0) expands K0 to its numeric value first, then
 * CONCAT_INC pastes that value onto "INC" to select one of the vectors above.
 */
#define CONCAT_INC(K0) INC##K0
#define INC(K0) CONCAT_INC(K0)
35
/* BOUNDARY_CONDITION_X(x, a): masks the K0-wide vector `a` against the right edge of the source.
 *
 * Lane i of `a` corresponds to column (x * K0 + i). When SRC_WIDTH is not a multiple of K0,
 * the macro re-assigns `a` through select() using the per-lane test (x * K0 + i) < SRC_WIDTH,
 * with 0 as the alternative value. When SRC_WIDTH is a multiple of K0 it expands to an empty
 * statement expression.
 *
 * NOTE(review): the mask is passed through the project's CONVERT helper to a DATA_TYPE vector
 * before reaching select(); confirm that this matches the mask type select() expects for
 * non-integer DATA_TYPEs.
 * NOTE(review): no use of this macro is visible in this part of the file - confirm whether it
 * is still needed.
 *
 * `x` is parenthesized so that an expression argument keeps its intended precedence.
 */
#if(SRC_WIDTH % K0)
#define BOUNDARY_CONDITION_X(x, a) \
    ({ \
        a = select(0, a, CONVERT((((x) * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
    })
#else // (SRC_WIDTH % K0)
#define BOUNDARY_CONDITION_X(x, a) \
    ({})
#endif // (SRC_WIDTH % K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000045
/* LOAD_TENSOR_BOUNDARY_AWARE_M0XK0: loads one block of the source into the row variables
 * a0, a1, ... by dispatching to LOAD_TENSOR_M0XN0 with a block size chosen per edge:
 *
 *  - rows:    PARTIAL_LOAD_M0 instead of M0 when (y * M0 + M0 >= SRC_HEIGHT) and PARTIAL_LOAD_M0 != 0
 *  - columns: PARTIAL_LOAD_K0 instead of K0 when (x * K0 + K0 >= SRC_WIDTH)  and PARTIAL_LOAD_K0 != 0
 *
 * The macro reads the variables `x` and `y` from the enclosing scope (they are not parameters),
 * so it can only be expanded where both are declared.
 */
#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({ \
        const bool load_partial_rows = (PARTIAL_LOAD_M0 != 0) && (y * (M0) + (M0) >= SRC_HEIGHT); \
        const bool load_partial_cols = (PARTIAL_LOAD_K0 != 0) && (x * (K0) + (K0) >= SRC_WIDTH); \
        if(load_partial_rows) \
        { \
            if(load_partial_cols) \
            { \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
            else \
            { \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
        } \
        else \
        { \
            if(load_partial_cols) \
            { \
                LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
            else \
            { \
                LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
        } \
    })
71
/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
 * @note Only the following values for M0, K0 and V0 are supported:
 *                                      M0: 2,3,4,5,6,7,8
 *                                      K0: 2,3,4,8,16
 *                                      V0: greater than 0
 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 *
 * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 */
__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                         ,
                                         uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                        )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: distance (in elements) between the starts of two consecutive blocks on the same output row
    // OUTPUT_STEP_X:   distance (in elements) between two consecutive rows of the same block in the output
    // Both expansions are fully parenthesized so they stay a single operand in any expression.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#define OUTPUT_STEP_X ((K0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates. The names x and y are required: LOAD_TENSOR_BOUNDARY_AWARE_M0XK0 reads them directly.
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: K0 elements per step in x, M0 rows per step in y
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 blocks share one output row, so y selects both the row (y / V0) and the slot in it (y % V0)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes
                                 + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE))
                                 + ((y / (uint)V0) * (uint)dst_stride_y)
                                 + ((y % (uint)V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    // Rows start as zero vectors, so any element skipped by a partial load stays 0
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

    // Load values from the LHS matrix
    LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

    // ---------------------------Store output values ------------------------------
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000185
/* TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)
 *
 * Gathers component `i` (a hexadecimal digit 0..F, pasted as .s<i>) from each of the row
 * vectors a0 ... a(M0-1) and stores the M0 gathered values contiguously at
 *     output_ptr + i * output_step_x * sizeof(DATA_TYPE)
 * where output_ptr is a byte pointer and output_step_x is expressed in elements.
 *
 * The row vectors a0 ... a(M0-1) are taken from the enclosing scope; they are not parameters.
 * For M0 = 5, 6 and 7 the column is written in two pieces (4 elements, then the remaining
 * 1, 2 or 3 elements).
 * The pointer and step arguments are parenthesized so that expression arguments keep their
 * intended precedence.
 */
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        DATA_TYPE res1 = a4.s##i; \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(2) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 3) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(3) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions
255
/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
 * @note Only the following values for M0, K0 and V0 are supported:
 *                                      M0: 2,3,4,5,6,7,8
 *                                      K0: 2,3,4,8,16
 *                                      V0: greater than 0
 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 *
 * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 */
__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                        ,
                                        uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                       )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: distance (in elements) between the starts of two consecutive blocks on the same output row
    // OUTPUT_STEP_X:   distance (in elements) between two consecutive transposed columns of the same block in the output
    // Both expansions are fully parenthesized so they stay a single operand in any expression.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (M0)
#define OUTPUT_STEP_X ((M0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (M0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates. The names x and y are required: LOAD_TENSOR_BOUNDARY_AWARE_M0XK0 reads them directly.
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: K0 elements per step in x, M0 rows per step in y
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 blocks share one output row, so y selects both the row (y / V0) and the slot in it (y % V0)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes
                                 + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE))
                                 + ((y / (uint)V0) * (uint)dst_stride_y)
                                 + ((y % (uint)V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    // Rows start as zero vectors, so any element skipped by a partial load stays 0
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

    LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

    // ---------------------------Transpose and store block -----------------------
    // One call per column of the block; the last argument is the hexadecimal component index (.s0 ... .sF)

    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
#if K0 > 2
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
#endif // K0 > 2
#if K0 > 3
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
#endif // K0 > 3
#if K0 > 4
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
#endif // K0 > 4
#if K0 > 8
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
#endif // K0 > 8

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100391#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000392
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000393#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 * @note Only the following values for K0, N0 and H0 are supported:
 *                                      N0: 2,3,4,8,16
 *                                      K0: 1,2,3,4,8,16
 *                                      H0: greater than 0
 *
 * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst))
{
    // Number of elements in one K0xN0 block
#define BLOCK_SIZE ((K0) * (N0))

    // OUTPUT_OFFSET_X: distance (in elements) between the starts of two consecutive blocks on the same output row
    // OUTPUT_STEP_X:   distance (in elements) between two consecutive rows of the same block in the output
    // Both expansions are fully parenthesized so they stay a single operand in any expression.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (N0)
#define OUTPUT_STEP_X ((N0) * (H0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (N0)
#endif // defined(INTERLEAVE)

    // Loads source row ROW of the current block into a<SUFFIX>, but only when that row lies
    // inside the source tensor; otherwise a<SUFFIX> keeps its initial value of 0.
    // SUFFIX is ROW written as a single hexadecimal digit (10 -> A, ..., 15 -> F).
#define LOAD_RHS_ROW_IF_IN_BOUNDS(ROW, SUFFIX) \
    do \
    { \
        if(y * (uint)K0 + ROW < SRC_HEIGHT) \
        { \
            a##SUFFIX = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + ROW * src_stride_y)); \
        } \
    } while(0)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: N0 elements per step in x, K0 rows per step in y
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address: H0 blocks share one output row, so x selects both the row (x / H0) and the slot in it (x % H0)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes
                                 + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE))
                                 + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE))
                                 + ((x / (uint)H0) * (uint)dst_stride_y)
                                 + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Create the K0 row vectors a0, a1, ... a(K0-1), each with N0 elements of DATA_TYPE, all set to 0
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0);

    // Load values from the RHS matrix. The first row is loaded unconditionally.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
#if K0 > 1
    LOAD_RHS_ROW_IF_IN_BOUNDS(1, 1);
#endif // K0 > 1
#if K0 > 2
    LOAD_RHS_ROW_IF_IN_BOUNDS(2, 2);
#endif // K0 > 2
#if K0 > 3
    LOAD_RHS_ROW_IF_IN_BOUNDS(3, 3);
#endif // K0 > 3
#if K0 > 4
    LOAD_RHS_ROW_IF_IN_BOUNDS(4, 4);
    LOAD_RHS_ROW_IF_IN_BOUNDS(5, 5);
    LOAD_RHS_ROW_IF_IN_BOUNDS(6, 6);
    LOAD_RHS_ROW_IF_IN_BOUNDS(7, 7);
#endif // K0 > 4
#if K0 > 8
    LOAD_RHS_ROW_IF_IN_BOUNDS(8, 8);
    LOAD_RHS_ROW_IF_IN_BOUNDS(9, 9);
    LOAD_RHS_ROW_IF_IN_BOUNDS(10, A);
    LOAD_RHS_ROW_IF_IN_BOUNDS(11, B);
    LOAD_RHS_ROW_IF_IN_BOUNDS(12, C);
    LOAD_RHS_ROW_IF_IN_BOUNDS(13, D);
    LOAD_RHS_ROW_IF_IN_BOUNDS(14, E);
    LOAD_RHS_ROW_IF_IN_BOUNDS(15, F);
#endif // K0 > 8

    // ---------------------------Store output values ------------------------------
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef LOAD_RHS_ROW_IF_IN_BOUNDS
#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
545
546#if defined(TRANSPOSE)
547/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
548 * the output matrix unrolling the values.
549 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100550 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
551 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
552 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
553 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 * @note The option -DTRANSPOSE must be passed at compile time.
556 * @note Only the following values for K0, N0 and H0 are supported:
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000557 * N0: 2,3,4,8,16
558 * K0: 2,3,4,8,16
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000559 * H0: greater than 0
560 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100561 * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000562 * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
563 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
564 * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
565 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
566 * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
567 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
568 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
569 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
570 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
571 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
572 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
573 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
574 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
575 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
576 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
577 */
578__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
579 TENSOR3D_DECLARATION(dst))
580{
581 // Block size
582#define BLOCK_SIZE ((K0) * (N0))
583
584 // Output offset X
585#if defined(INTERLEAVE)
586#define OUTPUT_OFFSET_X (K0)
587#else // defined(INTERLEAVE)
588#define OUTPUT_OFFSET_X (BLOCK_SIZE)
589#endif // defined(INTERLEAVE)
590
591 // Output step X
592#if defined(INTERLEAVE)
593#define OUTPUT_STEP_X (K0) * (H0)
594#else // Do not interleave
595#define OUTPUT_STEP_X (K0)
596#endif // defined(INTERLEAVE)
597
598 // Compute source and destination addresses
599 uint x = get_global_id(0);
600 uint y = get_global_id(1);
601 uint z = get_global_id(2);
602
603 // ------------------ Compute input/output addresses ---------------------------
604
605 // Compute the input address
606 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
607
608 // Compute the output address
609 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
610 (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
611
612 // ---------------------------Load input values --------------------------------
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +0000613 REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000614
615 // Load values from the RHS matrix
616 a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
617 if(y * (uint)K0 + 1 < SRC_HEIGHT)
618 {
619 a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
620 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000621#if K0 > 2
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000622 if(y * (uint)K0 + 2 < SRC_HEIGHT)
623 {
624 a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
625 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000626#endif // K0 > 2
627#if K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000628 if(y * (uint)K0 + 3 < SRC_HEIGHT)
629 {
630 a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
631 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000632#endif // K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000633#if K0 > 4
634 if(y * (uint)K0 + 4 < SRC_HEIGHT)
635 {
636 a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
637 }
638 if(y * (uint)K0 + 5 < SRC_HEIGHT)
639 {
640 a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
641 }
642 if(y * (uint)K0 + 6 < SRC_HEIGHT)
643 {
644 a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
645 }
646 if(y * (uint)K0 + 7 < SRC_HEIGHT)
647 {
648 a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
649 }
650#endif // K0 > 4
651#if K0 > 8
Gian Marco Iodice89124342018-12-19 14:17:22 +0000652 if(y * (uint)K0 + 8 < SRC_HEIGHT)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000653 {
654 a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
655 }
656 if(y * (uint)K0 + 9 < SRC_HEIGHT)
657 {
658 a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
659 }
660 if(y * (uint)K0 + 10 < SRC_HEIGHT)
661 {
662 aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
663 }
664 if(y * (uint)K0 + 11 < SRC_HEIGHT)
665 {
666 aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
667 }
668 if(y * (uint)K0 + 12 < SRC_HEIGHT)
669 {
670 aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
671 }
672 if(y * (uint)K0 + 13 < SRC_HEIGHT)
673 {
674 aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
675 }
676 if(y * (uint)K0 + 14 < SRC_HEIGHT)
677 {
678 aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
679 }
680 if(y * (uint)K0 + 15 < SRC_HEIGHT)
681 {
682 aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
683 }
684#endif // K0 > 8
685
686 // ---------------------------Transpose the block ------------------------------
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +0000687 REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000688
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000689#if K0 == 2
690 // This part computes the following transpositions:
691 // 2x2 -> 2x2
692 // 2x4 -> 4x2
693 // 2x8 -> 8x2
694 // 2x16 -> 16x2
695 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
696 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
697#if N0 > 2
698 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
699#endif // N0 > 2
700#if N0 > 3
701 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
702#endif // N0 > 3
703#if N0 > 4
704 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
705 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
706 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
707 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
708#endif // N0 > 4
709#if N0 > 8
710 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
711 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
712 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
713 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
714 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
715 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
716 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
717 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
718#endif // N0 > 8
719
720#elif K0 == 3 // K0 == 2
721 // This part computes the following transpositions:
722 // 3x2 -> 2x3
723 // 3x4 -> 4x3
724 // 3x8 -> 8x3
725 // 3x16 -> 16x3
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100726 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
727 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000728#if N0 > 2
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100729 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000730#endif // N0 > 2
731#if N0 > 3
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100732 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000733#endif // N0 > 3
734#if N0 > 4
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100735 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
736 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
737 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
738 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000739#endif // N0 > 4
740#if N0 > 8
Georgios Pinitasb0f342e2019-05-21 13:32:43 +0100741 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
742 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
743 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
744 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
745 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
746 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
747 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
748 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000749#endif // N0 > 8
750
751#elif K0 == 4 // K0 == 4
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000752 // This part computes the following transpositions:
753 // 4x2 -> 2x4
754 // 4x4 -> 4x4
755 // 4x8 -> 8x4
756 // 4x16 -> 16x4
757 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
758 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
759#if N0 > 2
760 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000761#endif // N0 > 2
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000762#if N0 > 3
763 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
764#endif // N0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000765#if N0 > 4
766 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
767 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
768 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
769 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
770#endif // N0 > 4
771#if N0 > 8
772 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
773 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
774 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
775 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
776 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
777 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
778 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
779 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
780#endif // N0 > 8
781
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000782#elif K0 == 8 // K0 == 8
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000783 // This part computes the following transpositions:
784 // 8x2 -> 2x8
785 // 8x4 -> 4x8
786 // 8x8 -> 8x8
787 // 8x16 -> 16x8
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000788 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
789 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000790#if N0 > 2
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000791 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000792#endif // N0 > 2
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000793#if N0 > 3
794 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
795#endif // N0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000796#if N0 > 4
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000797 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
798 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
799 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
800 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000801#endif // N0 > 4
802#if N0 > 8
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000803 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
804 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
805 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
806 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
807 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
808 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
809 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
810 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000811#endif // N0 > 8
812
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000813#elif K0 == 16 // K0 == 16
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000814
815 // This part computes the following transpositions:
816 // 16x2 -> 2x16
817 // 16x4 -> 4x16
818 // 16x8 -> 8x16
819 // 16x16 -> 16x16
820 res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
821 a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
822 res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
823 a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
824#if N0 > 2
825 res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
826 a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000827#endif // N0 > 2
828#if N0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000829 res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
830 a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000831#endif // N0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000832#if N0 > 4
833 res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
834 a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
835 res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
836 a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
837 res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
838 a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
839 res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
840 a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
841#endif // N0 > 4
842#if N0 > 8
843 res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
844 a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
845 res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
846 a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
847 resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
848 a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
849 resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
850 a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
851 resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
852 a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
853 resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
854 a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
855 resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
856 a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
857 resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
858 a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
859#endif // N0 > 8
860
861#else // N0 == 16
862#error "Not supported N0 value"
863#endif // N0 > 2
864
865 // ---------------------------Store the output values ------------------------------
Usama Arif0681e3b2019-04-25 14:28:07 +0100866 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
867 STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000868
869#undef BLOCK_SIZE
870#undef OUTPUT_OFFSET_X
871#undef OUTPUT_STEP_X
872}
873#endif // defined(TRANSPOSE)
874#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
875
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000876#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +0000877
/** Pastes the tokens @p a and @p b together. The arguments are pasted as written: a macro name passed directly
 *  is not expanded, so callers that need expansion must forward the value through another macro first.
 */
#define CONCAT(a, b) a##b

/** Dot-product accumulators: c = fma(a[i], b[i], c) for every element i, in increasing element order.
 *
 * ARM_DOT1 takes scalars; ARM_DOT2/3/4 read the components .s0 ... .s3 of @p a and @p b;
 * ARM_DOT8 and ARM_DOT16 process the .lo half first and the .hi half second.
 *
 * Every use of @p a, @p b and @p c is parenthesised so that arbitrary expressions (e.g. a dereferenced pointer)
 * can be passed safely. @p c must be a modifiable lvalue; it is both read and written.
 */
#define ARM_DOT1(a, b, c)         \
    ({                            \
        (c) = fma((a), (b), (c)); \
    })
#define ARM_DOT2(a, b, c)               \
    ({                                  \
        (c) = fma((a).s0, (b).s0, (c)); \
        (c) = fma((a).s1, (b).s1, (c)); \
    })
#define ARM_DOT3(a, b, c)               \
    ({                                  \
        ARM_DOT2(a, b, c);              \
        (c) = fma((a).s2, (b).s2, (c)); \
    })
#define ARM_DOT4(a, b, c)               \
    ({                                  \
        ARM_DOT3(a, b, c);              \
        (c) = fma((a).s3, (b).s3, (c)); \
    })
#define ARM_DOT8(a, b, c)               \
    ({                                  \
        ARM_DOT4((a).lo, (b).lo, c);    \
        ARM_DOT4((a).hi, (b).hi, c);    \
    })
#define ARM_DOT16(a, b, c)              \
    ({                                  \
        ARM_DOT8((a).lo, (b).lo, c);    \
        ARM_DOT8((a).hi, (b).hi, c);    \
    })
909
910#if N0 == 2
911#define ARM_DOT_K0XN0(k0, a, b, c) \
912 ({ \
913 CONCAT(ARM_DOT, k0) \
914 ((a), (b##0), (c.s0)); \
915 CONCAT(ARM_DOT, k0) \
916 ((a), (b##1), (c.s1)); \
917 })
918#elif N0 == 3 // N0 == 3
919#define ARM_DOT_K0XN0(k0, a, b, c) \
920 ({ \
921 CONCAT(ARM_DOT, k0) \
922 ((a), (b##0), (c.s0)); \
923 CONCAT(ARM_DOT, k0) \
924 ((a), (b##1), (c.s1)); \
925 CONCAT(ARM_DOT, k0) \
926 ((a), (b##2), (c.s2)); \
927 })
928#elif N0 == 4 // N0 == 4
929#define ARM_DOT_K0XN0(k0, a, b, c) \
930 ({ \
931 CONCAT(ARM_DOT, k0) \
932 ((a), (b##0), (c.s0)); \
933 CONCAT(ARM_DOT, k0) \
934 ((a), (b##1), (c.s1)); \
935 CONCAT(ARM_DOT, k0) \
936 ((a), (b##2), (c.s2)); \
937 CONCAT(ARM_DOT, k0) \
938 ((a), (b##3), (c.s3)); \
939 })
940#elif N0 == 8 // N0 == 8
941#define ARM_DOT_K0XN0(k0, a, b, c) \
942 ({ \
943 CONCAT(ARM_DOT, k0) \
944 ((a), (b##0), (c.s0)); \
945 CONCAT(ARM_DOT, k0) \
946 ((a), (b##1), (c.s1)); \
947 CONCAT(ARM_DOT, k0) \
948 ((a), (b##2), (c.s2)); \
949 CONCAT(ARM_DOT, k0) \
950 ((a), (b##3), (c.s3)); \
951 CONCAT(ARM_DOT, k0) \
952 ((a), (b##4), (c.s4)); \
953 CONCAT(ARM_DOT, k0) \
954 ((a), (b##5), (c.s5)); \
955 CONCAT(ARM_DOT, k0) \
956 ((a), (b##6), (c.s6)); \
957 CONCAT(ARM_DOT, k0) \
958 ((a), (b##7), (c.s7)); \
959 })
960#elif N0 == 16 // N0 == 16
961#define ARM_DOT_K0XN0(k0, a, b, c) \
962 ({ \
963 CONCAT(ARM_DOT, k0) \
964 ((a), (b##0), (c.s0)); \
965 CONCAT(ARM_DOT, k0) \
966 ((a), (b##1), (c.s1)); \
967 CONCAT(ARM_DOT, k0) \
968 ((a), (b##2), (c.s2)); \
969 CONCAT(ARM_DOT, k0) \
970 ((a), (b##3), (c.s3)); \
971 CONCAT(ARM_DOT, k0) \
972 ((a), (b##4), (c.s4)); \
973 CONCAT(ARM_DOT, k0) \
974 ((a), (b##5), (c.s5)); \
975 CONCAT(ARM_DOT, k0) \
976 ((a), (b##6), (c.s6)); \
977 CONCAT(ARM_DOT, k0) \
978 ((a), (b##7), (c.s7)); \
979 CONCAT(ARM_DOT, k0) \
980 ((a), (b##8), (c.s8)); \
981 CONCAT(ARM_DOT, k0) \
982 ((a), (b##9), (c.s9)); \
983 CONCAT(ARM_DOT, k0) \
984 ((a), (b##A), (c.sA)); \
985 CONCAT(ARM_DOT, k0) \
986 ((a), (b##B), (c.sB)); \
987 CONCAT(ARM_DOT, k0) \
988 ((a), (b##C), (c.sC)); \
989 CONCAT(ARM_DOT, k0) \
990 ((a), (b##D), (c.sD)); \
991 CONCAT(ARM_DOT, k0) \
992 ((a), (b##E), (c.sE)); \
993 CONCAT(ARM_DOT, k0) \
994 ((a), (b##F), (c.sF)); \
995 })
996#else // N0 not supported
997#error "N0 value not supported"
998#endif // N0 conditions
999
1000/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1001 * The LHS matrix is NOT reshaped
1002 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
1003 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001004 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
1006 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
1007 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
1008 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
1009 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01001011 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1012 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001013 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1014 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1015 * - N0 = 2, 3, 4, 8, 16
1016 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001017 * - H0 >= 1
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001018 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001020 * The activation function is performed after the bias addition
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001021 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1022 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1023 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1024 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1025 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1026 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1027 *
Sheri Zhang1a378102020-04-30 12:59:39 +01001028 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
1029 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
 * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001033 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001034 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1035 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
1039 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001040 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
1041 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
1042 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
1043 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
1044 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
1045 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001046 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1047 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1048 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1049 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1050 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1051 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Sheri Zhang1a378102020-04-30 12:59:39 +01001052 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001053 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001054 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001055 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1056 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1057 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001058 */
1059__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
1060 IMAGE_DECLARATION(rhs),
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001061#if defined(BETA)
1062 IMAGE_DECLARATION(bias),
1063#endif // defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001064 IMAGE_DECLARATION(dst),
1065 uint lhs_stride_z,
1066 uint rhs_stride_z,
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001067#if defined(BETA)
1068 uint bias_stride_z,
1069#endif //defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001070 uint dst_stride_z
1071#if defined(REINTERPRET_INPUT_AS_3D)
1072 ,
1073 uint lhs_cross_plane_pad
1074#endif // REINTERPRET_INPUT_AS_3D
1075#if defined(REINTERPRET_OUTPUT_AS_3D)
1076 ,
1077 uint dst_cross_plane_pad
1078#endif // REINTERPRET_OUTPUT_AS_3D
1079 )
1080{
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001081 // Block size
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001082#define RHS_BLOCK_SIZE ((K0) * (N0))
1083
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001084 // RHS offset and step X
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001085#if defined(RHS_INTERLEAVE)
1086#define RHS_OFFSET_X (K0)
1087#define RHS_STEP_X ((K0) * (H0))
1088#define RHS_STEP_LOOP (1)
1089#else // defined(RHS_INTERLEAVE)
1090#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1091#define RHS_STEP_X (K0)
1092#define RHS_STEP_LOOP (H0)
1093#endif // defined(RHS_INTERLEAVE)
1094
1095 uint x = get_global_id(0);
1096 uint y = get_global_id(1);
1097 uint z = get_global_id(2);
1098
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001099#if defined(DUMMY_WORK_ITEMS)
1100 if((x * N0 >= N) || (y * M0 >= M))
1101 {
1102 return;
1103 }
1104#endif // defined(DUMMY_WORK_ITEMS)
1105
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001106 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01001107 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001108
Sheri Zhang1a378102020-04-30 12:59:39 +01001109 // Compute RHS reshaped matrix address
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001110 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
1111
1112#if defined(MATRIX_B_DEPTH)
1113 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1114 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1115#else // defined(MATRIX_B_DEPTH)
1116 rhs_offset += z * rhs_stride_z;
1117#endif // defined(MATRIX_B_DEPTH)
1118
Usama Arif0681e3b2019-04-25 14:28:07 +01001119 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001120 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001121
1122#if defined(REINTERPRET_INPUT_AS_3D)
Usama Arif0681e3b2019-04-25 14:28:07 +01001123 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01001124 CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001125
1126 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1127 // multiply lhs_stride_z by DEPTH_GEMM3D
1128 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1129
1130#else // defined(REINTERPRET_INPUT_AS_3D)
1131
1132 // Add offset for batched GEMM
1133 lhs_offset += z * lhs_stride_z;
1134
1135#endif // defined(REINTERPRET_INPUT_AS_3D)
1136
1137 // Initialize the accumulators
1138 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
1139
1140 int i = 0;
1141 for(; i <= (K - K0); i += K0)
1142 {
1143 // Supported cases (M0, K0):
1144 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
1145 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
1146 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
1147 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
1148 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
1149 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
1150 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
1151 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
1152 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001153 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001154
Sheri Zhang1a378102020-04-30 12:59:39 +01001155 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001156 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001157
1158 // Accumulate
1159 ARM_DOT_K0XN0(K0, a0, b, c0);
1160#if M0 > 1
1161 ARM_DOT_K0XN0(K0, a1, b, c1);
1162#endif // M0 > 1
1163#if M0 > 2
1164 ARM_DOT_K0XN0(K0, a2, b, c2);
1165#endif // M0 > 2
1166#if M0 > 3
1167 ARM_DOT_K0XN0(K0, a3, b, c3);
1168#endif // M0 > 3
1169#if M0 > 4
1170 ARM_DOT_K0XN0(K0, a4, b, c4);
1171#endif // M0 > 4
1172#if M0 > 5
1173 ARM_DOT_K0XN0(K0, a5, b, c5);
1174#endif // M0 > 5
1175#if M0 > 6
1176 ARM_DOT_K0XN0(K0, a6, b, c6);
1177#endif // M0 > 6
1178#if M0 > 7
1179 ARM_DOT_K0XN0(K0, a7, b, c7);
1180#endif // M0 > 7
1181
1182 lhs_offset += K0 * sizeof(DATA_TYPE);
1183 rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
1184 }
1185
1186 // Left-over accumulations
1187 for(; i < K; ++i)
1188 {
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001189 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001190 LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001191
Sheri Zhang1a378102020-04-30 12:59:39 +01001192 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001193 LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001194
1195 // Accumulate
1196 ARM_DOT_K0XN0(1, a0, b, c0);
1197#if M0 > 1
1198 ARM_DOT_K0XN0(1, a1, b, c1);
1199#endif // M0 > 1
1200#if M0 > 2
1201 ARM_DOT_K0XN0(1, a2, b, c2);
1202#endif // M0 > 2
1203#if M0 > 3
1204 ARM_DOT_K0XN0(1, a3, b, c3);
1205#endif // M0 > 3
1206#if M0 > 4
1207 ARM_DOT_K0XN0(1, a4, b, c4);
1208#endif // M0 > 4
1209#if M0 > 5
1210 ARM_DOT_K0XN0(1, a5, b, c5);
1211#endif // M0 > 5
1212#if M0 > 6
1213 ARM_DOT_K0XN0(1, a6, b, c6);
1214#endif // M0 > 6
1215#if M0 > 7
1216 ARM_DOT_K0XN0(1, a7, b, c7);
1217#endif // M0 > 7
1218
1219 lhs_offset += sizeof(DATA_TYPE);
1220 rhs_offset += sizeof(DATA_TYPE);
1221 }
1222
SiCong Li406a13f2020-07-15 12:09:58 +01001223 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001224
1225 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1226
1227#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001228
1229 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01001230 CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001231
1232 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1233 // multiply dst_stride_z by DEPTH_GEMM3D
1234 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1235
1236#else // defined(REINTERPRET_OUTPUT_AS_3D)
1237
1238 // Add offset for batched GEMM
1239 dst_addr += z * dst_stride_z;
1240
1241#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1242
1243 // Multiply by the weight of matrix-matrix product and store the result
1244#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01001245 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001246#endif // defined(ALPHA)
1247
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001248 // Add beta*bias
1249#if defined(BETA)
1250#if defined(BROADCAST_BIAS)
1251 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
1252
1253 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1254
1255#ifndef UNIT_BETA
1256 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
1257#endif // UNIT_BIAS
1258
1259 // c = c + bias[broadcasted]
1260 ADD_BLOCK_BROADCAST(M0, c, bias0);
1261
1262#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01001263 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001264
1265 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1266
1267#ifndef UNIT_BETA
1268 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
1269#endif // UNIT_BIAS
1270
1271 // c = c + bias
1272 ADD_BLOCK(M0, c, bias);
1273
1274#endif // defined(BROADCAST_BIAS)
1275#endif // defined(BETA)
1276
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001277#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001278 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001279#endif // defined(ACTIVATION_TYPE)
1280
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01001281 const bool cond_y = y == 0;
1282 const bool cond_x = ((x + 1) * N0 >= N);
1283
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001284 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001285 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001286
1287#undef RHS_BLOCK_SIZE
1288#undef RHS_OFFSET_X
1289#undef RHS_STEP_X
1290}
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001291
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note K does not have to be a multiple of K0: when (K % K0) != 0 the remaining LHS elements are read one by one into
 *       zero-initialised K0-wide vectors and accumulated with one extra K0-wide RHS block.
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes). Not referenced by this kernel: the RHS is addressed through image coordinates
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t rhs_img,
#if defined(BETA)
                                                  IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                  IMAGE_DECLARATION(dst),
                                                  uint lhs_stride_z,
                                                  uint rhs_stride_z,
#if defined(BETA)
                                                  uint bias_stride_z,
#endif //defined(BETA)
                                                  uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                  ,
                                                  uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
                                                  uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Pixel unit
    // NOTE(review): presumably the number of image pixels that hold K0 elements - confirm against CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Number of K elements that do not fill a whole K0 block
#define LEFTOVER_K (K % K0)

    // Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items dispatched beyond the N x M output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    // The batch index is folded into the image row because the RHS image is 2D (see the RHS_HEIGHT note above)
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,... zlhs(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,... zero15=0;

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,... c(M0-1)=0;

    // Main loop: consume K in whole blocks of K0 elements
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

#if LEFTOVER_K != 0
    // Note: We cannot read out-of-bound elements from the RHS matrix because
    // the RHS width is always multiple of K0. This is not true for the LHS matrix,
    // so the LHS leftover elements are read one at a time

    // Gives per-element write access (s[k]) to a K0-wide vector (v)
    union UNION_VEC_TYPE
    {
        DATA_TYPE s[K0];
        VEC_DATA_TYPE(DATA_TYPE, K0) v;
    };

    // Zero-initialised so that the lanes beyond LEFTOVER_K stay at 0
    union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
    union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
    union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
    union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
    union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
    union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
    union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
    union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7

    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

    // Load from RHS matrix
    LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

    // Load from LHS matrix
    for(int k = 0; k < LEFTOVER_K; ++k)
    {
        a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
        a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
        a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
        a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
        a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
        a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
        a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
        a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7

        lhs_offset += sizeof(DATA_TYPE);
    }

    // Accumulate
    ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
    ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
    ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
    ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
    ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
    ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
    ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
    ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7

#endif // LEFTOVER_K != 0

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout(M0-1)=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every accumulator row
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // NOTE(review): VEC_SIZE is not among the build options documented for this kernel, while the
    // accumulators are N0 elements wide - confirm the host defines VEC_SIZE and that it matches N0
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Boundary flags for the store: first block of rows in y, last block of columns in x
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // Release every kernel-local macro so that it cannot leak into the code that follows
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef LEFTOVER_K
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
1640
/** Fused multiply-add accumulate: c = fma(a, b, c).
 *
 * The body uses the statement-expression form `({ ... })`, so the macro can be used wherever a single statement is expected.
 * @p c appears on both sides of the assignment and must therefore be a modifiable lvalue.
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator, overwritten with the result
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/** Accumulates one element of each LHS row against one RHS vector, for all the M0 rows.
 *
 * For every row r in [0, M0) the macro expands to:
 *   c##r = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a##r.s##i), b, c##r)
 * i.e. component i of the row vector a##r is converted to an N0-wide vector, multiplied by @p b and added to c##r.
 *
 * @p i, @p a and @p c are operands of the token-pasting operator (##) and are therefore NOT macro-expanded:
 * they must be passed as the literal component suffix and the literal variable base names.
 *
 * One definition is selected at preprocessing time according to M0; any value outside [1, 8] stops the build.
 *
 * @param[in]      i Vector component suffix selecting the element of each LHS row (pasted after ".s")
 * @param[in]      a Base name of the LHS row vectors (a##0 ... a##(M0-1))
 * @param[in]      b RHS vector shared by all the rows
 * @param[in, out] c Base name of the accumulators (c##0 ... c##(M0-1))
 */
#if M0 == 1
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported
1717
1718/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1719 * The LHS matrix is NOT reshaped
1720 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
1721 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001722 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
1724 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
1725 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
1726 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01001728 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1729 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001730 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1731 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1732 * - N0 = 2, 3, 4, 8, 16
1733 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001734 * - H0 >= 1
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001735 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01001736 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001737 * The activation function is performed after the bias addition
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001738 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1739 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1740 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1741 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1742 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1743 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1744 *
Sheri Zhang1a378102020-04-30 12:59:39 +01001745 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
1746 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001747 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001748 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001749 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001750 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001751 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1752 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1753 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1754 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1755 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1756 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001757 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
1758 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001759 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001760 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001761 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
1762 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
1763 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1764 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1765 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1766 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1767 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1768 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Sheri Zhang1a378102020-04-30 12:59:39 +01001769 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001770 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001771 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001772 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1773 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1774 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001775 */
1776__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
1777 IMAGE_DECLARATION(rhs),
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001778#if defined(BETA)
1779 IMAGE_DECLARATION(bias),
1780#endif // defined(BETA)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001781 IMAGE_DECLARATION(dst),
1782 uint lhs_stride_z,
1783 uint rhs_stride_z,
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001784#if defined(BETA)
1785 uint bias_stride_z,
1786#endif //defined(BETA)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001787 uint dst_stride_z
1788#if defined(REINTERPRET_INPUT_AS_3D)
1789 ,
1790 uint lhs_cross_plane_pad
1791#endif // REINTERPRET_INPUT_AS_3D
1792#if defined(REINTERPRET_OUTPUT_AS_3D)
1793 ,
1794 uint dst_cross_plane_pad
1795#endif // REINTERPRET_OUTPUT_AS_3D
1796 )
1797{
1798 // Block size
1799#define RHS_BLOCK_SIZE ((K0) * (N0))
1800
1801 // RHS offset and step X
1802#if defined(RHS_INTERLEAVE)
1803#define RHS_OFFSET_X (N0)
1804#define RHS_STEP_X ((N0) * (H0))
1805#define RHS_STEP_LOOP (1)
1806#else // defined(RHS_INTERLEAVE)
1807#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1808#define RHS_STEP_X (N0)
1809#define RHS_STEP_LOOP (H0)
1810#endif // defined(RHS_INTERLEAVE)
1811
1812 uint x = get_global_id(0);
1813 uint y = get_global_id(1);
1814 uint z = get_global_id(2);
1815
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001816#if defined(DUMMY_WORK_ITEMS)
1817 if((x * N0 >= N) || (y * M0 >= M))
1818 {
1819 return;
1820 }
1821#endif // defined(DUMMY_WORK_ITEMS)
1822
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001823 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01001824 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001825
Sheri Zhang1a378102020-04-30 12:59:39 +01001826 // Compute RHS reshaped matrix address
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001827 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
1828
1829#if defined(MATRIX_B_DEPTH)
1830 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1831 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1832#else // defined(MATRIX_B_DEPTH)
1833 rhs_offset += z * rhs_stride_z;
1834#endif // defined(MATRIX_B_DEPTH)
1835
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001836 REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
1837 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001838
1839#if defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001840
1841 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01001842 CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001843
1844 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1845 // multiply lhs_stride_z by DEPTH_GEMM3D
1846 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1847
1848#else // defined(REINTERPRET_INPUT_AS_3D)
1849
1850 // Add offset for batched GEMM
1851 lhs_offset += z * lhs_stride_z;
1852
1853#endif // defined(REINTERPRET_INPUT_AS_3D)
1854
1855 // Initialize the accumulators
1856 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
1857
1858 int i = 0;
1859 for(; i <= (K - K0); i += K0)
1860 {
1861 // Supported cases (M0, K0):
1862 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
1863 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
1864 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
1865 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
1866 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
1867 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
1868 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
1869 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
1870 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001871 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001872
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001873 VEC_DATA_TYPE(DATA_TYPE, N0)
1874 b0;
1875
1876 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
1877 VFMA_M0xN0(0, a, b0, c);
1878 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
1879 VFMA_M0xN0(1, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001880#if K0 > 2
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001881 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
1882 VFMA_M0xN0(2, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001883#endif // K0 > 2
1884#if K0 > 3
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001885 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
1886 VFMA_M0xN0(3, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001887#endif // K0 > 3
1888#if K0 > 4
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001889 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
1890 VFMA_M0xN0(4, a, b0, c);
1891 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
1892 VFMA_M0xN0(5, a, b0, c);
1893 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
1894 VFMA_M0xN0(6, a, b0, c);
1895 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
1896 VFMA_M0xN0(7, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001897#endif // K0 > 4
1898#if K0 > 8
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001899 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
1900 VFMA_M0xN0(8, a, b0, c);
1901 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
1902 VFMA_M0xN0(9, a, b0, c);
1903 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
1904 VFMA_M0xN0(A, a, b0, c);
1905 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
1906 VFMA_M0xN0(B, a, b0, c);
1907 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
1908 VFMA_M0xN0(C, a, b0, c);
1909 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
1910 VFMA_M0xN0(D, a, b0, c);
1911 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
1912 VFMA_M0xN0(E, a, b0, c);
1913 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
1914 VFMA_M0xN0(F, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001915#endif // K0 > 8
1916
1917 lhs_offset += K0 * sizeof(DATA_TYPE);
1918 rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
1919 }
1920
1921 // Left-over accumulations
1922 for(; i < K; ++i)
1923 {
1924 // Load values from LHS matrix
1925 VEC_DATA_TYPE(DATA_TYPE, 2)
1926 a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
1927#if M0 > 1
1928 VEC_DATA_TYPE(DATA_TYPE, 2)
1929 a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
1930#endif // M0 > 1
1931#if M0 > 2
1932 VEC_DATA_TYPE(DATA_TYPE, 2)
1933 a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
1934#endif // M0 > 2
1935#if M0 > 3
1936 VEC_DATA_TYPE(DATA_TYPE, 2)
1937 a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
1938#endif // M0 > 3
1939#if M0 > 4
1940 VEC_DATA_TYPE(DATA_TYPE, 2)
1941 a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
1942#endif // M0 > 4
1943#if M0 > 5
1944 VEC_DATA_TYPE(DATA_TYPE, 2)
1945 a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
1946#endif // M0 > 5
1947#if M0 > 6
1948 VEC_DATA_TYPE(DATA_TYPE, 2)
1949 a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
1950#endif // M0 > 6
1951#if M0 > 7
1952 VEC_DATA_TYPE(DATA_TYPE, 2)
giuros01b3204e72019-04-01 13:50:22 +01001953 a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001954#endif // M0 > 7
1955
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001956 VEC_DATA_TYPE(DATA_TYPE, N0)
1957 b0;
1958
1959 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
1960 VFMA_M0xN0(0, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001961
1962 lhs_offset += sizeof(DATA_TYPE);
1963 rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
1964 }
1965
SiCong Li406a13f2020-07-15 12:09:58 +01001966 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001967
1968 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1969
1970#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001971 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01001972 CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001973
1974 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1975 // multiply dst_stride_z by DEPTH_GEMM3D
1976 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1977
1978#else // defined(REINTERPRET_OUTPUT_AS_3D)
1979
1980 // Add offset for batched GEMM
1981 dst_addr += z * dst_stride_z;
1982
1983#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1984
1985 // Multiply by the weight of matrix-matrix product and store the result
1986#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01001987 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001988#endif // defined(ALPHA)
1989
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001990 // Add beta*bias
1991#if defined(BETA)
1992#if defined(BROADCAST_BIAS)
1993 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
1994
1995 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1996
1997#ifndef UNIT_BETA
1998 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
1999#endif // UNIT_BIAS
2000
2001 // c = c + bias[broadcasted]
2002 ADD_BLOCK_BROADCAST(M0, c, bias0);
2003
2004#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01002005 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01002006
2007 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2008
2009#ifndef UNIT_BETA
2010 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
2011#endif // UNIT_BIAS
2012
2013 // c = c + bias
2014 ADD_BLOCK(M0, c, bias);
2015
2016#endif // defined(BROADCAST_BIAS)
2017#endif // defined(BETA)
2018
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002019#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01002020 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002021#endif // defined(ACTIVATION_TYPE)
2022
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002023 const bool cond_y = y == 0;
2024 const bool cond_x = ((x + 1) * N0 >= N);
2025
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00002026 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01002027 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00002028
2029#undef RHS_BLOCK_SIZE
2030#undef RHS_OFFSET_X
2031#undef RHS_STEP_X
2032}
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002033
2034#if defined(OPENCL_IMAGE_SUPPORT)
2035/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
2036 * The LHS matrix is NOT reshaped
2037 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
2038 *
2039 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
2040 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
2042 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
2043 * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
2044 * could be different from the value returned by get_image_height(rhs_img).
2045 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
2046 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
2047 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01002049 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
2050 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002051 * @note Only the following configurations of M0, N0 and K0 are currently supported:
2052 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
2053 * - N0 = 4, 8, 16
2054 * - K0 = 4, 8, 16
2055 * - H0 >= 1
2056 *
2057 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
2058 * The activation function is performed after the bias addition
2059 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
2060 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
2061 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
2062 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
2063 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
2064 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
2065 *
2066 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32
2067 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
 * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
2069 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
2071 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
2072 * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
2073 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
2074 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
2075 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
2076 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
2077 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
2078 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
2079 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
2080 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
2081 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2082 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
2083 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2084 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
2085 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
2086 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
2087 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
2088 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2089 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
2090 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
2091 */
2092__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
2093 __read_only image2d_t rhs_img,
2094#if defined(BETA)
2095 IMAGE_DECLARATION(bias),
2096#endif // defined(BETA)
2097 IMAGE_DECLARATION(dst),
2098 uint lhs_stride_z,
2099 uint rhs_stride_z,
2100#if defined(BETA)
2101 uint bias_stride_z,
2102#endif //defined(BETA)
2103 uint dst_stride_z
2104#if defined(REINTERPRET_INPUT_AS_3D)
2105 ,
2106 uint lhs_cross_plane_pad
2107#endif // REINTERPRET_INPUT_AS_3D
2108#if defined(REINTERPRET_OUTPUT_AS_3D)
2109 ,
2110 uint dst_cross_plane_pad
2111#endif // REINTERPRET_OUTPUT_AS_3D
2112 )
2113{
2114 // Pixel unit
2115#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
2116
2117 // Block size
2118#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
2119
2120 // RHS offset and step X
2121#if defined(RHS_INTERLEAVE)
2122#define RHS_OFFSET_X (PIXEL_UNIT)
2123#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
2124#else // defined(RHS_INTERLEAVE)
2125#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
2126#define RHS_STEP_X (PIXEL_UNIT)
2127#endif // defined(RHS_INTERLEAVE)
2128
2129 uint x = get_global_id(0);
2130 uint y = get_global_id(1);
2131 uint z = get_global_id(2);
2132
2133#if defined(DUMMY_WORK_ITEMS)
2134 if((x * N0 >= N) || (y * M0 >= M))
2135 {
2136 return;
2137 }
2138#endif // defined(DUMMY_WORK_ITEMS)
2139
2140 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01002141 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002142
2143#if defined(MATRIX_B_DEPTH)
2144 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
2145 const uint z_rhs = (z % MATRIX_B_DEPTH);
2146#else // defined(MATRIX_B_DEPTH)
2147 const uint z_rhs = z;
2148#endif // defined(MATRIX_B_DEPTH)
2149
2150 // Compute RHS matrix coordinates
2151 uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
2152 const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
2153
2154 REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
2155 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
2156
2157#if defined(REINTERPRET_INPUT_AS_3D)
2158
2159 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01002160 CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002161
2162 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
2163 // multiply lhs_stride_z by DEPTH_GEMM3D
2164 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
2165
2166#else // defined(REINTERPRET_INPUT_AS_3D)
2167
2168 // Add offset for batched GEMM
2169 lhs_offset += z * lhs_stride_z;
2170
2171#endif // defined(REINTERPRET_INPUT_AS_3D)
2172
2173 // Initialize the accumulators
2174 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
2175
2176 int i = 0;
2177 for(; i <= (K - K0); i += K0)
2178 {
2179 // Load values from LHS matrix
2180 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
2181
2182 VEC_DATA_TYPE(DATA_TYPE, N0)
2183 b0;
2184
2185 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
2186 VFMA_M0xN0(0, a, b0, c);
2187 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
2188 VFMA_M0xN0(1, a, b0, c);
2189#if K0 > 2
2190 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
2191 VFMA_M0xN0(2, a, b0, c);
2192#endif // K0 > 2
2193#if K0 > 3
2194 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
2195 VFMA_M0xN0(3, a, b0, c);
2196#endif // K0 > 3
2197#if K0 > 4
2198 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
2199 VFMA_M0xN0(4, a, b0, c);
2200 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
2201 VFMA_M0xN0(5, a, b0, c);
2202 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
2203 VFMA_M0xN0(6, a, b0, c);
2204 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
2205 VFMA_M0xN0(7, a, b0, c);
2206#endif // K0 > 4
2207#if K0 > 8
2208 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
2209 VFMA_M0xN0(8, a, b0, c);
2210 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
2211 VFMA_M0xN0(9, a, b0, c);
2212 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
2213 VFMA_M0xN0(A, a, b0, c);
2214 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
2215 VFMA_M0xN0(B, a, b0, c);
2216 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
2217 VFMA_M0xN0(C, a, b0, c);
2218 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
2219 VFMA_M0xN0(D, a, b0, c);
2220 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
2221 VFMA_M0xN0(E, a, b0, c);
2222 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
2223 VFMA_M0xN0(F, a, b0, c);
2224#endif // K0 > 8
2225
2226 lhs_offset += K0 * sizeof(DATA_TYPE);
2227 x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
2228 }
2229
2230 // Left-over accumulations
2231 for(; i < K; ++i)
2232 {
2233 // Load values from LHS matrix
2234 VEC_DATA_TYPE(DATA_TYPE, 2)
2235 a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
2236#if M0 > 1
2237 VEC_DATA_TYPE(DATA_TYPE, 2)
2238 a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
2239#endif // M0 > 1
2240#if M0 > 2
2241 VEC_DATA_TYPE(DATA_TYPE, 2)
2242 a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
2243#endif // M0 > 2
2244#if M0 > 3
2245 VEC_DATA_TYPE(DATA_TYPE, 2)
2246 a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
2247#endif // M0 > 3
2248#if M0 > 4
2249 VEC_DATA_TYPE(DATA_TYPE, 2)
2250 a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
2251#endif // M0 > 4
2252#if M0 > 5
2253 VEC_DATA_TYPE(DATA_TYPE, 2)
2254 a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
2255#endif // M0 > 5
2256#if M0 > 6
2257 VEC_DATA_TYPE(DATA_TYPE, 2)
2258 a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
2259#endif // M0 > 6
2260#if M0 > 7
2261 VEC_DATA_TYPE(DATA_TYPE, 2)
2262 a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
2263#endif // M0 > 7
2264
2265 VEC_DATA_TYPE(DATA_TYPE, N0)
2266 b0;
2267 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
2268
2269 VFMA_M0xN0(0, a, b0, c);
2270
2271 lhs_offset += sizeof(DATA_TYPE);
2272 x_rhs += RHS_STEP_X;
2273 }
2274
SiCong Li406a13f2020-07-15 12:09:58 +01002275 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002276
2277 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
2278
2279#if defined(REINTERPRET_OUTPUT_AS_3D)
2280 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01002281 CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002282
2283 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
2284 // multiply dst_stride_z by DEPTH_GEMM3D
2285 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
2286
2287#else // defined(REINTERPRET_OUTPUT_AS_3D)
2288
2289 // Add offset for batched GEMM
2290 dst_addr += z * dst_stride_z;
2291
2292#endif // defined(REINTERPRET_OUTPUT_AS_3D)
2293
2294 // Multiply by the weight of matrix-matrix product and store the result
2295#if defined(ALPHA)
2296 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
2297#endif // defined(ALPHA)
2298
2299 // Add beta*bias
2300#if defined(BETA)
2301#if defined(BROADCAST_BIAS)
2302 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
2303
2304 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2305
2306#ifndef UNIT_BETA
2307 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
2308#endif // UNIT_BIAS
2309
2310 // c = c + bias[broadcasted]
2311 ADD_BLOCK_BROADCAST(M0, c, bias0);
2312
2313#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01002314 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002315
2316 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2317
2318#ifndef UNIT_BETA
2319 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
2320#endif // UNIT_BIAS
2321
2322 // c = c + bias
2323 ADD_BLOCK(M0, c, bias);
2324
2325#endif // defined(BROADCAST_BIAS)
2326#endif // defined(BETA)
2327
2328#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01002329 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002330#endif // defined(ACTIVATION_TYPE)
2331
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002332 const bool cond_y = y == 0;
2333 const bool cond_x = ((x + 1) * N0 >= N);
2334
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002335 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01002336 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002337
2338#undef RHS_BLOCK_SIZE
2339#undef RHS_OFFSET_X
2340#undef RHS_STEP_X
2341}
2342#endif // defined(OPENCL_IMAGE_SUPPORT)
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002343#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00002344
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002345#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002346
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002347#if defined(MIXED_PRECISION)
2348#if K0 == 2
2349#define ARM_DOT_K0(a, b, c) \
2350 ({ \
2351 c += a.s0 * b.s0; \
2352 c += a.s1 * b.s1; \
2353 })
2354#elif K0 == 3 // K0 == 3
2355#define ARM_DOT_K0(a, b, c) \
2356 ({ \
2357 c += a.s0 * b.s0; \
2358 c += a.s1 * b.s1; \
2359 c += a.s2 * b.s2; \
2360 })
2361#elif K0 == 4 // K0 == 4
2362#define ARM_DOT_K0(a, b, c) \
2363 ({ \
2364 c += a.s0 * b.s0; \
2365 c += a.s1 * b.s1; \
2366 c += a.s2 * b.s2; \
2367 c += a.s3 * b.s3; \
2368 })
2369#elif K0 == 8 // K0 == 8
2370#define ARM_DOT_K0(a, b, c) \
2371 ({ \
2372 c += a.s0 * b.s0; \
2373 c += a.s1 * b.s1; \
2374 c += a.s2 * b.s2; \
2375 c += a.s3 * b.s3; \
2376 c += a.s4 * b.s4; \
2377 c += a.s5 * b.s5; \
2378 c += a.s6 * b.s6; \
2379 c += a.s7 * b.s7; \
2380 })
2381#elif K0 == 16 // K0 == 16
2382#define ARM_DOT_K0(a, b, c) \
2383 ({ \
2384 c += a.s0 * b.s0; \
2385 c += a.s1 * b.s1; \
2386 c += a.s2 * b.s2; \
2387 c += a.s3 * b.s3; \
2388 c += a.s4 * b.s4; \
2389 c += a.s5 * b.s5; \
2390 c += a.s6 * b.s6; \
2391 c += a.s7 * b.s7; \
2392 c += a.s8 * b.s8; \
2393 c += a.s9 * b.s9; \
2394 c += a.sA * b.sA; \
2395 c += a.sB * b.sB; \
2396 c += a.sC * b.sC; \
2397 c += a.sD * b.sD; \
2398 c += a.sE * b.sE; \
2399 c += a.sF * b.sF; \
2400 })
2401#else // K0 not supported
2402#error "K0 value not supported"
2403#endif // K0 conditions
2404#else // defined(MIXED_PRECISION)
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002405#if K0 == 2
2406#define ARM_DOT_K0(a, b, c) \
2407 ({ \
2408 c = fma(a.s0, b.s0, c); \
2409 c = fma(a.s1, b.s1, c); \
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002410 })
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002411#elif K0 == 3 // K0 == 3
2412#define ARM_DOT_K0(a, b, c) \
2413 ({ \
2414 c = fma(a.s0, b.s0, c); \
2415 c = fma(a.s1, b.s1, c); \
2416 c = fma(a.s2, b.s2, c); \
2417 })
2418#elif K0 == 4 // K0 == 4
2419#define ARM_DOT_K0(a, b, c) \
2420 ({ \
2421 c = fma(a.s0, b.s0, c); \
2422 c = fma(a.s1, b.s1, c); \
2423 c = fma(a.s2, b.s2, c); \
2424 c = fma(a.s3, b.s3, c); \
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002425 })
2426#elif K0 == 8 // K0 == 8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002427#define ARM_DOT_K0(a, b, c) \
2428 ({ \
2429 c = fma(a.s0, b.s0, c); \
2430 c = fma(a.s1, b.s1, c); \
2431 c = fma(a.s2, b.s2, c); \
2432 c = fma(a.s3, b.s3, c); \
2433 c = fma(a.s4, b.s4, c); \
2434 c = fma(a.s5, b.s5, c); \
2435 c = fma(a.s6, b.s6, c); \
2436 c = fma(a.s7, b.s7, c); \
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002437 })
2438#elif K0 == 16 // K0 == 16
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002439#define ARM_DOT_K0(a, b, c) \
2440 ({ \
2441 c = fma(a.s0, b.s0, c); \
2442 c = fma(a.s1, b.s1, c); \
2443 c = fma(a.s2, b.s2, c); \
2444 c = fma(a.s3, b.s3, c); \
2445 c = fma(a.s4, b.s4, c); \
2446 c = fma(a.s5, b.s5, c); \
2447 c = fma(a.s6, b.s6, c); \
2448 c = fma(a.s7, b.s7, c); \
2449 c = fma(a.s8, b.s8, c); \
2450 c = fma(a.s9, b.s9, c); \
2451 c = fma(a.sA, b.sA, c); \
2452 c = fma(a.sB, b.sB, c); \
2453 c = fma(a.sC, b.sC, c); \
2454 c = fma(a.sD, b.sD, c); \
2455 c = fma(a.sE, b.sE, c); \
2456 c = fma(a.sF, b.sF, c); \
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002457 })
2458#else // K0 not supported
2459#error "K0 value not supported"
2460#endif // K0 conditions
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002461#endif // defined(MIXED_PRECISION)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002462
Giorgio Arena7d2f69f2021-05-11 16:39:33 +01002463#if defined(ARM_DOT_K0XN0)
2464#undef ARM_DOT_K0XN0
2465#endif // defined(ARM_DOT_K0XN0)
2466
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002467#if N0 == 2
2468#define ARM_DOT_K0XN0(a, b, c) \
2469 ({ \
2470 ARM_DOT_K0((a), (b##0), (c.s0)); \
2471 ARM_DOT_K0((a), (b##1), (c.s1)); \
2472 })
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002473#elif N0 == 3 // N0 == 3
2474#define ARM_DOT_K0XN0(a, b, c) \
2475 ({ \
2476 ARM_DOT_K0((a), (b##0), (c.s0)); \
2477 ARM_DOT_K0((a), (b##1), (c.s1)); \
2478 ARM_DOT_K0((a), (b##2), (c.s2)); \
2479 })
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002480#elif N0 == 4 // N0 == 4
2481#define ARM_DOT_K0XN0(a, b, c) \
2482 ({ \
2483 ARM_DOT_K0((a), (b##0), (c.s0)); \
2484 ARM_DOT_K0((a), (b##1), (c.s1)); \
2485 ARM_DOT_K0((a), (b##2), (c.s2)); \
2486 ARM_DOT_K0((a), (b##3), (c.s3)); \
2487 })
2488#elif N0 == 8 // N0 == 8
2489#define ARM_DOT_K0XN0(a, b, c) \
2490 ({ \
2491 ARM_DOT_K0((a), (b##0), (c.s0)); \
2492 ARM_DOT_K0((a), (b##1), (c.s1)); \
2493 ARM_DOT_K0((a), (b##2), (c.s2)); \
2494 ARM_DOT_K0((a), (b##3), (c.s3)); \
2495 ARM_DOT_K0((a), (b##4), (c.s4)); \
2496 ARM_DOT_K0((a), (b##5), (c.s5)); \
2497 ARM_DOT_K0((a), (b##6), (c.s6)); \
2498 ARM_DOT_K0((a), (b##7), (c.s7)); \
2499 })
2500#elif N0 == 16 // N0 == 16
2501#define ARM_DOT_K0XN0(a, b, c) \
2502 ({ \
2503 ARM_DOT_K0((a), (b##0), (c.s0)); \
2504 ARM_DOT_K0((a), (b##1), (c.s1)); \
2505 ARM_DOT_K0((a), (b##2), (c.s2)); \
2506 ARM_DOT_K0((a), (b##3), (c.s3)); \
2507 ARM_DOT_K0((a), (b##4), (c.s4)); \
2508 ARM_DOT_K0((a), (b##5), (c.s5)); \
2509 ARM_DOT_K0((a), (b##6), (c.s6)); \
2510 ARM_DOT_K0((a), (b##7), (c.s7)); \
2511 ARM_DOT_K0((a), (b##8), (c.s8)); \
2512 ARM_DOT_K0((a), (b##9), (c.s9)); \
2513 ARM_DOT_K0((a), (b##A), (c.sA)); \
2514 ARM_DOT_K0((a), (b##B), (c.sB)); \
2515 ARM_DOT_K0((a), (b##C), (c.sC)); \
2516 ARM_DOT_K0((a), (b##D), (c.sD)); \
2517 ARM_DOT_K0((a), (b##E), (c.sE)); \
2518 ARM_DOT_K0((a), (b##F), (c.sF)); \
2519 })
2520#else // N0 not supported
2521#error "N0 value not supported"
2522#endif // N0 conditions
2523
2524/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
2525 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
2526 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
2527 *
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002528 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
2529 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
2530 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002531 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002532 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002533 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
2534 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
2535 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002536 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
2537 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002538 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
2539 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002540 * @note Only the following configurations of M0, N0 and K0 are currently supported:
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01002541 * - M0 = 2, 3, 4, 5, 6, 7, 8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002542 * - N0 = 2, 3, 4, 8, 16
2543 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +00002544 * - V0 >= 1
2545 * - H0 >= 1
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002546 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002547 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002548 * The activation function is performed after the bias addition
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002549 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002550 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
2551 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
2552 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
2553 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
2554 *
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002555 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
2556 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
2557 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2558 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
2559 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2560 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
2561 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
2562 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
2563 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2564 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
2565 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2566 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
2567 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
2568 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
2569 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
2570 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
2571 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
2572 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
2573 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
2574 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
2575 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2576 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
2577 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2578 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002579 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002580 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
2581 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
2582 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
2583 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2584 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002585 */
2586__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
2587 IMAGE_DECLARATION(rhs),
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002588#if defined(BETA)
2589 IMAGE_DECLARATION(bias),
2590#endif // defined(BETA)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002591 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002592 uint k,
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002593 uint lhs_stride_z,
2594 uint rhs_stride_z,
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002595#if defined(BETA)
2596 uint bias_stride_z,
2597#endif //defined(BETA)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002598 uint dst_stride_z
2599#if defined(REINTERPRET_OUTPUT_AS_3D)
2600 ,
2601 uint dst_cross_plane_pad
2602#endif // REINTERPRET_OUTPUT_AS_3D
2603 )
2604{
2605 // Block size
2606#define LHS_BLOCK_SIZE ((K0) * (M0))
2607
2608#if defined(LHS_INTERLEAVE)
2609#define LHS_OFFSET_X (K0)
2610#define LHS_STEP_X ((K0) * (V0))
2611#define LHS_STEP_LOOP (1)
2612#else // defined(INTERLEAVE)
2613#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
2614#define LHS_STEP_X (K0)
2615#define LHS_STEP_LOOP (V0)
2616#endif // defined(INTERLEAVE)
2617
2618 // Block size
2619#define RHS_BLOCK_SIZE ((K0) * (N0))
2620
2621 // RHS offset and step X
2622#if defined(RHS_INTERLEAVE)
2623#define RHS_OFFSET_X (K0)
2624#define RHS_STEP_X ((K0) * (H0))
2625#define RHS_STEP_LOOP (1)
2626#else // defined(RHS_INTERLEAVE)
2627#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
2628#define RHS_STEP_X (K0)
2629#define RHS_STEP_LOOP (H0)
2630#endif // defined(RHS_INTERLEAVE)
2631
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002632#if defined(DUMMY_WORK_ITEMS)
2633 if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
2634 {
2635 return;
2636 }
2637#endif // defined(DUMMY_WORK_ITEMS)
2638
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002639 // Compute LHS matrix address
2640 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
2641 (get_global_id(2) * lhs_stride_z);
2642
2643 // Compute RHS matrix address
2644 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
2645
2646#if defined(MATRIX_B_DEPTH)
2647 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
2648 rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
2649#else // defined(MATRIX_B_DEPTH)
2650 rhs_addr += get_global_id(2) * rhs_stride_z;
2651#endif // defined(MATRIX_B_DEPTH)
2652
2653 // Initialize the accumulators
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002654 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002655
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002656 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
2657 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
Usama Arif0681e3b2019-04-25 14:28:07 +01002658
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002659 for(int i = 0; i < k; i += K0)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002660 {
2661 // Supported cases (M0, K0):
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00002662 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
2663 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
2664 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
2665 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
2666 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
2667 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
2668 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
2669 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002670 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01002671 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002672
2673 // Load values from RHS matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002674 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002675
2676 // Accumulate
2677 ARM_DOT_K0XN0(a0, b, c0);
2678#if M0 > 1
2679 ARM_DOT_K0XN0(a1, b, c1);
2680#endif // M0 > 1
2681#if M0 > 2
2682 ARM_DOT_K0XN0(a2, b, c2);
2683#endif // M0 > 2
2684#if M0 > 3
2685 ARM_DOT_K0XN0(a3, b, c3);
2686#endif // M0 > 3
2687#if M0 > 4
2688 ARM_DOT_K0XN0(a4, b, c4);
2689#endif // M0 > 4
2690#if M0 > 5
2691 ARM_DOT_K0XN0(a5, b, c5);
2692#endif // M0 > 5
2693#if M0 > 6
2694 ARM_DOT_K0XN0(a6, b, c6);
2695#endif // M0 > 6
2696#if M0 > 7
2697 ARM_DOT_K0XN0(a7, b, c7);
2698#endif // M0 > 7
2699
2700 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
2701 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
2702 }
2703
2704 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
2705
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002706 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002707
2708#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002709
2710 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Michele Di Giorgio5fa963f2020-11-23 15:05:12 +00002711 CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002712 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
2713 // multiply dst_stride_z by DEPTH_GEMM3D
2714 dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
2715
2716#else // defined(REINTERPRET_OUTPUT_AS_3D)
2717
2718 // Add offset for batched GEMM
2719 dst_addr += get_global_id(2) * dst_stride_z;
2720
2721#endif // defined(REINTERPRET_OUTPUT_AS_3D)
2722
2723 // Multiply by the weight of matrix-matrix product and store the result
2724#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01002725 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002726#endif // defined(ALPHA)
2727
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002728 // Add beta*bias
2729#if defined(BETA)
2730#if defined(BROADCAST_BIAS)
2731 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
2732
2733 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2734
2735#ifndef UNIT_BETA
2736 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
2737#endif // UNIT_BIAS
2738
2739 // c = c + bias[broadcasted]
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002740#if defined(MIXED_PRECISION)
2741 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
2742 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
2743#else // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002744 ADD_BLOCK_BROADCAST(M0, c, bias0);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002745#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002746
2747#else // defined(BROADCAST_BIAS)
2748 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
2749 2) * bias_stride_z;
2750
2751 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2752
2753#ifndef UNIT_BETA
2754 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
2755#endif // UNIT_BIAS
2756
2757 // c = c + bias
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002758#if defined(MIXED_PRECISION)
2759 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
2760 ADD_BLOCK(M0, c, bias_hp);
2761#else // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002762 ADD_BLOCK(M0, c, bias);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002763#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002764
2765#endif // defined(BROADCAST_BIAS)
2766#endif // defined(BETA)
2767
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002768#if defined(ACTIVATION_TYPE)
Georgios Pinitasa07ce152019-10-11 17:38:50 +01002769#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01002770 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01002771#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01002772 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01002773#endif // defined(MIXED_PRECISION)
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002774#endif // defined(ACTIVATION_TYPE)
2775
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002776 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
2777 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
2778
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002779 // Store output block
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002780#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002781 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01002782 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002783#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01002784 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002785#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002786
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002787#undef LHS_BLOCK_SIZE
2788#undef LHS_OFFSET_X
2789#undef LHS_STEP_X
2790#undef RHS_BLOCK_SIZE
2791#undef RHS_OFFSET_X
2792#undef RHS_STEP_X
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002793#undef LHS_STEP_LOOP
2794#undef RHS_STEP_LOOP
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002795}
giuros01b3204e72019-04-01 13:50:22 +01002796
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                    IMAGE_DECLARATION(dst),
                                                    uint k,
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif // defined(BETA)
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                   )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // LHS block size
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size (expressed in pixel units)
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the N x M output have nothing to compute
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    // NOTE(review): the loop bound is the compile-time K (-DK); the kernel argument k is not read
    // anywhere in this kernel, whereas gemm_mm_reshaped_lhs_nt_rhs_t iterates up to k.
    // Confirm that this difference is intentional.
    for(int i = 0; i < K; i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif // M0 > 7

        // Advance the LHS pointer (in bytes) and the RHS x coordinate to the next block
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += get_global_id(2) * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // True when this work-item's block reaches or crosses the last row / column of the output
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
3068
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003069#if defined(LHS_TRANSPOSE)
3070
// Expands to the vector type holding SIZE elements of TYPE.
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(N0, a, b, c): multiply-accumulate, c = a * b + c.
//  - With MIXED_PRECISION defined, a and b are first converted to a vector of
//    N0 elements of DATA_TYPE_ACCUMULATOR; otherwise they are used as they are.
//  - When GPU_ARCH == GPU_ARCH_MIDGARD the operation is spelled as "c += a * b",
//    otherwise fma() is used.
// Note: the expansion already ends with a semicolon and assigns to c, so the macro
// must be used as a stand-alone statement and c must be an lvalue.
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)
3090
// Column-vector (transposed) by row-vector (not transposed) multiplication for K0 = 1.
// ARM_VVM_T_NT_<M0>xN0x1 accumulates, for every row i in [0, M0), the product of the
// i-th element of a (broadcast to an N0-wide vector of TYPE) and b into C##i.
// The 1xN0x1 variant takes a as a scalar; the others read a.s0, a.s1, ...
// Each variant reuses the next smaller one and adds the missing rows.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix (name prefix of the row accumulators C##0, C##1, ...)
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
// M0 is pasted into the macro name, so it must reach this macro as a literal: 1, 2, 3, 4 or 8
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
3126
// Matrix (transposed) by matrix (not transposed) multiplication for a fixed K0.
// ARM_MM_T_NT_M0xN0x<K0> issues one ARM_VVM_T_NT_M0xN0x1 per k in [0, K0), pairing the
// vectors A##k and B##k (suffixes 0-9, then A-F) and accumulating into the rows of C.
// Each variant reuses the next smaller one and appends the missing k steps.
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
    })
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
    })
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
    })
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
    })
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
    })
// The parameters of the x16 variant are deliberately NOT named A, B and C: those names
// would capture the hexadecimal suffix tokens A, B and C used below (e.g. A##A would
// paste the argument with itself instead of producing the <name>A vector).
// The k = 8..F steps call the vector-by-vector macro directly, since the operands are
// already complete (parenthesised) vector names and cannot be pasted again.
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, LHS_MAT, RHS_MAT, DST_MAT)                \
    ({                                                                               \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, LHS_MAT, RHS_MAT, DST_MAT);                \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS_MAT##8), (RHS_MAT##8), DST_MAT);     \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS_MAT##9), (RHS_MAT##9), DST_MAT);     \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS_MAT##A), (RHS_MAT##A), DST_MAT);     \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS_MAT##B), (RHS_MAT##B), DST_MAT);     \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS_MAT##C), (RHS_MAT##C), DST_MAT);     \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS_MAT##D), (RHS_MAT##D), DST_MAT);     \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS_MAT##E), (RHS_MAT##E), DST_MAT);     \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS_MAT##F), (RHS_MAT##F), DST_MAT);     \
    })
3166
// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// The dimensions for this matrix multiplications are defined through M0, N0 and K0
// The dimensions supported are:
// M0: 1, 2, 3, 4, 8
// N0: 1, 2, 3, 4, 8, 16
// K0: 1, 2, 3, 4, 8, 16
// This macro dispatches to ARM_MM_T_NT_M0xN0x<K0>, which calls the vector-by-vector macro K0 times
// A, B and C are matrices (name prefixes of the A##k, B##k vectors and of the C##i accumulators)
// K0 goes through CONCAT, so it may itself be a macro expanding to one of the supported values
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)
3178
3179/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
3180 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3181 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3182 *
3183 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
3184 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003185 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003186 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3187 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3188 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3189 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3190 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003191 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
3192 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003193 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3194 * - M0 = 2, 3, 4, 8
3195 * - N0 = 2, 3, 4, 8, 16
3196 * - K0 = 2, 3, 4, 8, 16
3197 * - V0 >= 1
3198 * - H0 >= 1
3199 *
3200 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3201 * The activation function is performed after the bias addition
3202 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3203 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3204 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3205 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3206 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3207 *
3208 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
3209 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3210 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3211 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3212 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3213 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3214 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
3215 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
3216 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3217 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
3218 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3219 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
3220 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3221 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3222 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3223 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3224 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3225 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3226 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3227 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3228 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3229 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3230 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3231 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003232 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003233 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3234 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3235 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3236 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3237 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3238 */
3239__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
3240 IMAGE_DECLARATION(rhs),
3241#if defined(BETA)
3242 IMAGE_DECLARATION(bias),
3243#endif // defined(BETA)
3244 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003245 uint k,
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003246 uint lhs_stride_z,
3247 uint rhs_stride_z,
3248#if defined(BETA)
3249 uint bias_stride_z,
3250#endif //defined(BETA)
3251 uint dst_stride_z
3252#if defined(REINTERPRET_OUTPUT_AS_3D)
3253 ,
3254 uint dst_cross_plane_pad
3255#endif // REINTERPRET_OUTPUT_AS_3D
3256 )
3257{
3258 // Block size
3259#define LHS_BLOCK_SIZE ((K0) * (M0))
3260
3261#if defined(LHS_INTERLEAVE)
3262#define LHS_OFFSET_X (M0)
3263#define LHS_STEP_X ((M0) * (V0))
3264#define LHS_STEP_LOOP (1)
3265#else // defined(INTERLEAVE)
3266#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3267#define LHS_STEP_X (M0)
3268#define LHS_STEP_LOOP (V0)
3269#endif // defined(INTERLEAVE)
3270
3271 // Block size
3272#define RHS_BLOCK_SIZE ((K0) * (N0))
3273
3274 // RHS offset and step X
3275#if defined(RHS_INTERLEAVE)
3276#define RHS_OFFSET_X (N0)
3277#define RHS_STEP_X ((N0) * (H0))
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003278#else // defined(RHS_INTERLEAVE)
3279#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3280#define RHS_STEP_X (N0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003281#endif // defined(RHS_INTERLEAVE)
3282
3283 const uint x = get_global_id(0);
3284 const uint y = get_global_id(1);
3285 const uint z = get_global_id(2);
3286
3287#if defined(DUMMY_WORK_ITEMS)
3288 if((x * N0 >= N) || (y * M0 >= M))
3289 {
3290 return;
3291 }
3292#endif // defined(DUMMY_WORK_ITEMS)
3293
3294 // Compute LHS matrix address
3295 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3296
3297 // Compute RHS matrix address
3298 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
3299
3300#if defined(MATRIX_B_DEPTH)
3301 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3302 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
3303#else // defined(MATRIX_B_DEPTH)
3304 rhs_addr += z * rhs_stride_z;
3305#endif // defined(MATRIX_B_DEPTH)
3306
3307 // Initialize the accumulators
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003308 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003309
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003310 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3311
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003312 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3313 __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
3314
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003315 for(int i = 0; i < k; i += K0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003316 {
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003317 VEC_DATA_TYPE(DATA_TYPE, M0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003318 a0;
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003319 VEC_DATA_TYPE(DATA_TYPE, N0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003320 b0;
3321
3322 a0 = VLOAD(M0)(0, lhs);
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003323 b0 = VLOAD(N0)(0, rhs);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003324
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003325 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003326
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003327 lhs += LHS_STEP_X;
3328 rhs += RHS_STEP_X;
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003329
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003330#if K0 > 1
3331 a0 = VLOAD(M0)(0, lhs);
3332 b0 = VLOAD(N0)(0, rhs);
3333
3334 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3335
3336 lhs += LHS_STEP_X;
3337 rhs += RHS_STEP_X;
3338#endif // K0 > 1
3339
3340#if K0 > 2
3341 a0 = VLOAD(M0)(0, lhs);
3342 b0 = VLOAD(N0)(0, rhs);
3343
3344 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3345
3346 lhs += LHS_STEP_X;
3347 rhs += RHS_STEP_X;
3348#endif // K0 > 2
3349
3350#if K0 > 3
3351 a0 = VLOAD(M0)(0, lhs);
3352 b0 = VLOAD(N0)(0, rhs);
3353
3354 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3355
3356 lhs += LHS_STEP_X;
3357 rhs += RHS_STEP_X;
3358#endif // K0 > 3
3359
3360#if K0 > 4
3361 a0 = VLOAD(M0)(0, lhs);
3362 b0 = VLOAD(N0)(0, rhs);
3363
3364 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3365
3366 lhs += LHS_STEP_X;
3367 rhs += RHS_STEP_X;
3368
3369 a0 = VLOAD(M0)(0, lhs);
3370 b0 = VLOAD(N0)(0, rhs);
3371
3372 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3373
3374 lhs += LHS_STEP_X;
3375 rhs += RHS_STEP_X;
3376
3377 a0 = VLOAD(M0)(0, lhs);
3378 b0 = VLOAD(N0)(0, rhs);
3379
3380 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3381
3382 lhs += LHS_STEP_X;
3383 rhs += RHS_STEP_X;
3384
3385 a0 = VLOAD(M0)(0, lhs);
3386 b0 = VLOAD(N0)(0, rhs);
3387
3388 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3389
3390 lhs += LHS_STEP_X;
3391 rhs += RHS_STEP_X;
3392#endif // K0 > 4
3393
3394#if K0 > 8
3395 a0 = VLOAD(M0)(0, lhs);
3396 b0 = VLOAD(N0)(0, rhs);
3397
3398 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3399
3400 lhs += LHS_STEP_X;
3401 rhs += RHS_STEP_X;
3402
3403 a0 = VLOAD(M0)(0, lhs);
3404 b0 = VLOAD(N0)(0, rhs);
3405
3406 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3407
3408 lhs += LHS_STEP_X;
3409 rhs += RHS_STEP_X;
3410
3411 a0 = VLOAD(M0)(0, lhs);
3412 b0 = VLOAD(N0)(0, rhs);
3413
3414 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3415
3416 lhs += LHS_STEP_X;
3417 rhs += RHS_STEP_X;
3418
3419 a0 = VLOAD(M0)(0, lhs);
3420 b0 = VLOAD(N0)(0, rhs);
3421
3422 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3423
3424 lhs += LHS_STEP_X;
3425 rhs += RHS_STEP_X;
3426
3427 a0 = VLOAD(M0)(0, lhs);
3428 b0 = VLOAD(N0)(0, rhs);
3429
3430 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3431
3432 lhs += LHS_STEP_X;
3433 rhs += RHS_STEP_X;
3434
3435 a0 = VLOAD(M0)(0, lhs);
3436 b0 = VLOAD(N0)(0, rhs);
3437
3438 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3439
3440 lhs += LHS_STEP_X;
3441 rhs += RHS_STEP_X;
3442
3443 a0 = VLOAD(M0)(0, lhs);
3444 b0 = VLOAD(N0)(0, rhs);
3445
3446 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3447
3448 lhs += LHS_STEP_X;
3449 rhs += RHS_STEP_X;
3450
3451 a0 = VLOAD(M0)(0, lhs);
3452 b0 = VLOAD(N0)(0, rhs);
3453
3454 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3455
3456 lhs += LHS_STEP_X;
3457 rhs += RHS_STEP_X;
3458#endif // K0 > 8
3459
3460#ifndef LHS_INTERLEAVE
3461 lhs += (M0 * K0 * (V0 - 1));
3462#endif // LHS_INTERLEAVE
3463
3464#ifndef RHS_INTERLEAVE
3465 rhs += (N0 * K0 * (H0 - 1));
3466#endif // RHS_INTERLEAVE
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003467 }
3468
3469 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3470
3471 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3472
3473#if defined(REINTERPRET_OUTPUT_AS_3D)
3474
3475 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Michele Di Giorgio5fa963f2020-11-23 15:05:12 +00003476 CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003477 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3478 // multiply dst_stride_z by DEPTH_GEMM3D
3479 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3480
3481#else // defined(REINTERPRET_OUTPUT_AS_3D)
3482
3483 // Add offset for batched GEMM
3484 dst_addr += z * dst_stride_z;
3485
3486#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3487
3488 // Multiply by the weight of matrix-matrix product and store the result
3489#if defined(ALPHA)
3490 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3491#endif // defined(ALPHA)
3492
3493 // Add beta*bias
3494#if defined(BETA)
3495#if defined(BROADCAST_BIAS)
3496 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3497
3498 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3499
3500#ifndef UNIT_BETA
3501 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3502#endif // UNIT_BIAS
3503
3504 // c = c + bias[broadcasted]
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003505#if defined(MIXED_PRECISION)
3506 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3507 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3508#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003509 ADD_BLOCK_BROADCAST(M0, c, bias0);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003510#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003511
3512#else // defined(BROADCAST_BIAS)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003513 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
3514 2) * bias_stride_z;
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003515
3516 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3517
3518#ifndef UNIT_BETA
3519 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3520#endif // UNIT_BIAS
3521
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003522#if defined(MIXED_PRECISION)
3523 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3524 ADD_BLOCK(M0, c, bias_hp);
3525#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003526 ADD_BLOCK(M0, c, bias);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003527#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003528
3529#endif // defined(BROADCAST_BIAS)
3530#endif // defined(BETA)
3531
3532#if defined(ACTIVATION_TYPE)
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003533#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003534 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003535#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003536 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003537#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003538#endif // defined(ACTIVATION_TYPE)
3539
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003540 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
3541 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
3542
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003543 // Store output block
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003544#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003545 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003546 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003547#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003548 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003549#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003550
3551#undef LHS_BLOCK_SIZE
3552#undef LHS_OFFSET_X
3553#undef LHS_STEP_X
3554#undef RHS_BLOCK_SIZE
3555#undef RHS_OFFSET_X
3556#undef RHS_STEP_X
3557}
3558
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003559#if defined(OPENCL_IMAGE_SUPPORT)
3560/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
3561 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3562 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3563 *
3564 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
3565 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003566 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
3567 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Gian Marco Iodice781cba72020-06-19 16:56:57 +01003568 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
3569 * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
3570 * could be different from the value returned by get_image_height(rhs_img).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003571 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3572 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3573 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3574 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3575 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003576 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
3577 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003578 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3579 * - M0 = 2, 3, 4, 8
3580 * - N0 = 4, 8, 16
3581 * - K0 = 4, 8, 16
3582 * - V0 >= 1
3583 * - H0 >= 1
3584 *
3585 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3586 * The activation function is performed after the bias addition
3587 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3588 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3589 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3590 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3591 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3592 *
3593 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
3594 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3595 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3596 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3597 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3598 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3599 * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
3600 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3601 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3602 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3603 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3604 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3605 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3606 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3607 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3608 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3609 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3610 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3611 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003612 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003613 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3614 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3615 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3616 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3617 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3618 */
3619__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
3620 __read_only image2d_t rhs_img,
3621#if defined(BETA)
3622 IMAGE_DECLARATION(bias),
3623#endif // defined(BETA)
3624 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003625 uint k,
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003626 uint lhs_stride_z,
3627 uint rhs_stride_z,
3628#if defined(BETA)
3629 uint bias_stride_z,
3630#endif //defined(BETA)
3631 uint dst_stride_z
3632#if defined(REINTERPRET_OUTPUT_AS_3D)
3633 ,
3634 uint dst_cross_plane_pad
3635#endif // REINTERPRET_OUTPUT_AS_3D
3636 )
3637{
 // Overview: each work-item keeps M0 accumulator vectors of N0 elements (c0..c(M0-1)),
 // reads the LHS from the lhs buffer with VLOAD(M0) and the RHS from rhs_img with READ_IMAGE2D,
 // then applies the optional ALPHA / BETA*bias / activation stages and stores the block to dst.
 // NOTE(review): the runtime argument k is not read anywhere in this body; the accumulation loop
 // is bounded by the compile-time macro K instead. Confirm K is always defined for this kernel
 // and that it matches the value passed as k.

3638 // Pixel unit: N0 expressed in image pixels via CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT (defined outside this section)
3639#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
3640
3641 // LHS block size (in elements)
3642#define LHS_BLOCK_SIZE ((K0) * (M0))
3643
 // LHS offset and step X, selected by the LHS_INTERLEAVE build option
3644#if defined(LHS_INTERLEAVE)
3645#define LHS_OFFSET_X (M0)
3646#define LHS_STEP_X ((M0) * (V0))
3647#define LHS_STEP_LOOP (1)
3648#else // defined(LHS_INTERLEAVE)
3649#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3650#define LHS_STEP_X (M0)
3651#define LHS_STEP_LOOP (V0)
3652#endif // defined(LHS_INTERLEAVE)
3653
3654 // RHS block size (in pixels)
3655#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
3656
3657 // RHS offset and step X
3658#if defined(RHS_INTERLEAVE)
3659#define RHS_OFFSET_X (PIXEL_UNIT)
3660#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
3661#else // defined(RHS_INTERLEAVE)
3662#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3663#define RHS_STEP_X (PIXEL_UNIT)
3664#endif // defined(RHS_INTERLEAVE)
3665
3666 const uint x = get_global_id(0);
3667 const uint y = get_global_id(1);
3668 const uint z = get_global_id(2);
3669
 // Work-items whose block starts at or beyond the N or M boundary have nothing to compute
3670#if defined(DUMMY_WORK_ITEMS)
3671 if((x * N0 >= N) || (y * M0 >= M))
3672 {
3673 return;
3674 }
3675#endif // defined(DUMMY_WORK_ITEMS)
3676
3677 // Compute LHS matrix address: (y % V0) selects the block inside a reshaped row, (y / V0) the reshaped row, z the batch
3678 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3679
3680#if defined(MATRIX_B_DEPTH)
3681 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3682 const uint z_rhs = (z % MATRIX_B_DEPTH);
3683#else // defined(MATRIX_B_DEPTH)
3684 const uint z_rhs = z;
3685#endif // defined(MATRIX_B_DEPTH)
3686
3687 // Compute RHS matrix coordinates (x_rhs is advanced inside the loop, y_rhs stays fixed; batches are stacked along y in steps of RHS_HEIGHT)
3688 uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
3689 const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
3690
3691 // Initialize the accumulators
3692 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
3693
 // Zero offsets passed as the last argument of LOAD_BLOCK when loading the bias
3694 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3695
3696 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3697
 // Accumulation loop: every iteration consumes K0 steps, each step loading one M0-vector of the LHS
 // and one N0-vector of the RHS and accumulating them into c through ARM_MM_T_NT.
 // The bound is the compile-time K (see the NOTE(review) at the top of the body).
3698 for(int i = 0; i < K; i += K0)
3699 {
3700 VEC_DATA_TYPE(DATA_TYPE, M0)
3701 a0;
3702 VEC_DATA_TYPE(DATA_TYPE, N0)
3703 b0;
3704
 // Step 0 (always executed)
3705 a0 = VLOAD(M0)(0, lhs);
3706 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
3707
3708 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3709
3710 lhs += LHS_STEP_X;
3711
3712#if K0 > 1
3713 a0 = VLOAD(M0)(0, lhs);
3714 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
3715
3716 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3717
3718 lhs += LHS_STEP_X;
3719#endif // K0 > 1
3720
3721#if K0 > 2
3722 a0 = VLOAD(M0)(0, lhs);
3723 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
3724
3725 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3726
3727 lhs += LHS_STEP_X;
3728#endif // K0 > 2
3729
3730#if K0 > 3
3731 a0 = VLOAD(M0)(0, lhs);
3732 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
3733
3734 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3735
3736 lhs += LHS_STEP_X;
3737#endif // K0 > 3
3738
 // Steps 4..7 are emitted together, so any K0 greater than 4 is treated as at least 8
3739#if K0 > 4
3740 a0 = VLOAD(M0)(0, lhs);
3741 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
3742
3743 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3744
3745 lhs += LHS_STEP_X;
3746
3747 a0 = VLOAD(M0)(0, lhs);
3748 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
3749
3750 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3751
3752 lhs += LHS_STEP_X;
3753
3754 a0 = VLOAD(M0)(0, lhs);
3755 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
3756
3757 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3758
3759 lhs += LHS_STEP_X;
3760
3761 a0 = VLOAD(M0)(0, lhs);
3762 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
3763
3764 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3765
3766 lhs += LHS_STEP_X;
3767#endif // K0 > 4
3768
 // Steps 8..15 are emitted together, so any K0 greater than 8 is treated as 16
3769#if K0 > 8
3770 a0 = VLOAD(M0)(0, lhs);
3771 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
3772
3773 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3774
3775 lhs += LHS_STEP_X;
3776
3777 a0 = VLOAD(M0)(0, lhs);
3778 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
3779
3780 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3781
3782 lhs += LHS_STEP_X;
3783
3784 a0 = VLOAD(M0)(0, lhs);
3785 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
3786
3787 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3788
3789 lhs += LHS_STEP_X;
3790
3791 a0 = VLOAD(M0)(0, lhs);
3792 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
3793
3794 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3795
3796 lhs += LHS_STEP_X;
3797
3798 a0 = VLOAD(M0)(0, lhs);
3799 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
3800
3801 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3802
3803 lhs += LHS_STEP_X;
3804
3805 a0 = VLOAD(M0)(0, lhs);
3806 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
3807
3808 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3809
3810 lhs += LHS_STEP_X;
3811
3812 a0 = VLOAD(M0)(0, lhs);
3813 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
3814
3815 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3816
3817 lhs += LHS_STEP_X;
3818
3819 a0 = VLOAD(M0)(0, lhs);
3820 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
3821
3822 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3823
3824 lhs += LHS_STEP_X;
3825#endif // K0 > 8
3826
 // Non-interleaved LHS: extra jump of M0 * K0 * (V0 - 1) elements on top of the K0 steps above
3827#ifndef LHS_INTERLEAVE
3828 lhs += (M0 * K0 * (V0 - 1));
3829#endif // !defined(LHS_INTERLEAVE)
3830
 // Advance the RHS x coordinate by the K0 steps consumed in this iteration
3831 x_rhs += K0 * RHS_STEP_X;
 // Non-interleaved RHS: extra jump of PIXEL_UNIT * K0 * (H0 - 1) pixels
3832#ifndef RHS_INTERLEAVE
3833 x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
3834#endif // !defined(RHS_INTERLEAVE)
3835 }
3836
 // Destination address of the first element of this work-item's M0 x N0 block
3837 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3838
 // Per-row output offsets; they stay 0 unless the output is reinterpreted as 3D
3839 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3840
3841#if defined(REINTERPRET_OUTPUT_AS_3D)
3842
3843 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Michele Di Giorgio5fa963f2020-11-23 15:05:12 +00003844 CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003845 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3846 // multiply dst_stride_z by DEPTH_GEMM3D
3847 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3848
3849#else // defined(REINTERPRET_OUTPUT_AS_3D)
3850
3851 // Add offset for batched GEMM
3852 dst_addr += z * dst_stride_z;
3853
3854#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3855
3856 // Multiply by the weight of matrix-matrix product (ALPHA); the result is stored further below
3857#if defined(ALPHA)
3858 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3859#endif // defined(ALPHA)
3860
3861 // Add beta*bias
3862#if defined(BETA)
3863#if defined(BROADCAST_BIAS)
 // Broadcast case: a single bias row (bias0) is loaded and added to every row of c
3864 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3865
3866 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3867
3868#ifndef UNIT_BETA
3869 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3870#endif // UNIT_BETA
3871
3872 // c = c + bias[broadcasted]
3873#if defined(MIXED_PRECISION)
3874 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3875 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3876#else // defined(MIXED_PRECISION)
3877 ADD_BLOCK_BROADCAST(M0, c, bias0);
3878#endif // defined(MIXED_PRECISION)
3879
3880#else // defined(BROADCAST_BIAS)
 // Full bias case: an M0 x N0 bias block is loaded from the same (x, y, z) position as the output block
3881 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
3882
3883 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3884
3885#ifndef UNIT_BETA
3886 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3887#endif // UNIT_BETA
3888
 // c = c + bias
3889#if defined(MIXED_PRECISION)
3890 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3891 ADD_BLOCK(M0, c, bias_hp);
3892#else // defined(MIXED_PRECISION)
3893 ADD_BLOCK(M0, c, bias);
3894#endif // defined(MIXED_PRECISION)
3895
3896#endif // defined(BROADCAST_BIAS)
3897#endif // defined(BETA)
3898
 // Optional activation, applied after the bias addition.
 // NOTE(review): VEC_SIZE is not defined in the visible part of this file; presumably it is supplied
 // at build time and should match N0 - confirm.
3899#if defined(ACTIVATION_TYPE)
3900#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003901 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003902#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003903 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003904#endif // defined(MIXED_PRECISION)
3905#endif // defined(ACTIVATION_TYPE)
3906
 // cond_y / cond_x are true when the end of this work-item's block reaches or passes the M / N boundary
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003907 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
3908 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
3909
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003910 // Store output block (in mixed precision the accumulators are first converted back to DATA_TYPE)
3911#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003912 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003913 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003914#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003915 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003916#endif // defined(MIXED_PRECISION)
3917
 // Release the kernel-local helper macros.
 // NOTE(review): RHS_STEP_LOOP is undefined here although this kernel never defines it (harmless).
3918#undef LHS_BLOCK_SIZE
3919#undef LHS_OFFSET_X
3920#undef LHS_STEP_X
3921#undef RHS_BLOCK_SIZE
3922#undef RHS_OFFSET_X
3923#undef RHS_STEP_X
3924#undef PIXEL_UNIT
3925#undef LHS_STEP_LOOP
3926#undef RHS_STEP_LOOP
3927}
3928#endif // defined(OPENCL_IMAGE_SUPPORT)
3929
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003930#endif // defined(LHS_TRANSPOSE)
3931
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00003932#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
3933
giuros01b3204e72019-04-01 13:50:22 +01003934#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
3935
// VFMA(a, b, c): in-place multiply-accumulate, c = fma(a, b, c).
3936#define VFMA(a, b, c) \
3937 ({ \
3938 c = fma(a, b, c); \
3939 })
3940
// RHS_VFMA_M0xN0(i, a, b, c): for every row r in [0, M0) performs
//   c##r = fma((VEC_DATA_TYPE(DATA_TYPE, N0))((a##r).s##i), b, c##r)
// i.e. component i of the variable a##r is converted to the N0-wide vector type, multiplied by b
// and accumulated into c##r. The arguments a and c are name prefixes (token-pasted with the row
// index) and i is a vector component suffix (0..9, A..F), so they must be passed as bare tokens.
// One variant exists per supported M0 (1 to 8); any other M0 stops compilation with #error.
3941#if M0 == 1
3942#define RHS_VFMA_M0xN0(i, a, b, c) \
3943 ({ \
3944 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3945 })
3946#elif M0 == 2 // M0 == 2
3947#define RHS_VFMA_M0xN0(i, a, b, c) \
3948 ({ \
3949 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3950 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3951 })
3952#elif M0 == 3 // M0 == 3
3953#define RHS_VFMA_M0xN0(i, a, b, c) \
3954 ({ \
3955 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3956 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3957 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3958 })
3959#elif M0 == 4 // M0 == 4
3960#define RHS_VFMA_M0xN0(i, a, b, c) \
3961 ({ \
3962 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3963 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3964 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3965 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3966 })
3967#elif M0 == 5 // M0 == 5
3968#define RHS_VFMA_M0xN0(i, a, b, c) \
3969 ({ \
3970 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3971 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3972 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3973 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3974 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3975 })
3976#elif M0 == 6 // M0 == 6
3977#define RHS_VFMA_M0xN0(i, a, b, c) \
3978 ({ \
3979 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3980 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3981 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3982 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3983 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3984 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
3985 })
3986#elif M0 == 7 // M0 == 7
3987#define RHS_VFMA_M0xN0(i, a, b, c) \
3988 ({ \
3989 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3990 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3991 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3992 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3993 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3994 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
3995 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
3996 })
3997#elif M0 == 8 // M0 == 8
3998#define RHS_VFMA_M0xN0(i, a, b, c) \
3999 ({ \
4000 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
4001 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
4002 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
4003 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
4004 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
4005 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
4006 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
4007 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
4008 })
4009#else // M0 not supported
4010#error "M0 not supported"
4011#endif // M0 not supported
4012
4013/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
4014 * The LHS matrix is NOT reshaped
4015 * The RHS matrix is NOT reshaped
4016 *
4017 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004018 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
4019 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
4020 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
4021 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
4022 * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
SiCong Li3a501662020-06-26 10:02:06 +01004023 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
4024 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
giuros01b3204e72019-04-01 13:50:22 +01004025 * @note Only the following configurations of M0, N0 and K0 are currently supported:
4026 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
4027 * - N0 = 2, 3, 4, 8, 16
4028 * - K0 = 2, 3, 4, 8, 16
4029 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004030 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004031 * The activation function is performed after the bias addition
giuros01b3204e72019-04-01 13:50:22 +01004032 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
4033 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
4034 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4035 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4036 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4037 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
4038 *
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004039 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
4040 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
4041 * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
4042 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
4043 * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
4044 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
4045 * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
4046 * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
4047 * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
4048 * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
4049 * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
4050 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004051 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
4052 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4053 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
4054 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4055 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
4056 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
4057 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
4058 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
4059 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
4060 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
4061 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
4062 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
4063 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
4064 * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
4065 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
4066 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
4067 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
4068 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
giuros01b3204e72019-04-01 13:50:22 +01004069 */
4070__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
4071 IMAGE_DECLARATION(rhs),
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004072#if defined(BETA)
4073 IMAGE_DECLARATION(bias),
4074#endif // defined(BETA)
giuros01b3204e72019-04-01 13:50:22 +01004075 IMAGE_DECLARATION(dst),
4076 uint lhs_stride_z,
4077 uint rhs_stride_z,
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004078#if defined(BETA)
4079 uint bias_stride_z,
4080#endif //defined(BETA)
giuros01b3204e72019-04-01 13:50:22 +01004081 uint dst_stride_z
4082#if defined(REINTERPRET_INPUT_AS_3D)
4083 ,
4084 uint lhs_cross_plane_pad
4085#endif // REINTERPRET_INPUT_AS_3D
4086#if defined(REINTERPRET_OUTPUT_AS_3D)
4087 ,
4088 uint dst_cross_plane_pad
4089#endif // REINTERPRET_OUTPUT_AS_3D
4090 )
4091{
 // Overview: each work-item accumulates an M0 x N0 block of the result in c0..c(M0-1), walking the
 // K dimension in chunks of K0 (main loop) and then one element at a time (left-over loop), before
 // applying the optional ALPHA / BETA*bias / activation stages and storing the block to dst.

4092 // RHS block size
4093#define RHS_BLOCK_SIZE ((K0) * (N0))
4094
4095 // RHS offset X (RHS_OFFSET_X is not referenced again in this kernel body before being undefined)
4096#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
4097
4098 uint x = get_global_id(0);
4099 uint y = get_global_id(1);
4100 uint z = get_global_id(2);
4101
 // Work-items whose block starts at or beyond the N or M boundary have nothing to compute
4102#if defined(DUMMY_WORK_ITEMS)
4103 if((x * N0 >= N) || (y * M0 >= M))
4104 {
4105 return;
4106 }
4107#endif // defined(DUMMY_WORK_ITEMS)
4108
4109 // Compute LHS matrix address (byte offset of the first row handled by this work-item)
SiCong Li406a13f2020-07-15 12:09:58 +01004110 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
giuros01b3204e72019-04-01 13:50:22 +01004111
4112 // Compute RHS matrix address (byte offset of the first column handled by this work-item)
4113 uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
4114
4115#if defined(MATRIX_B_DEPTH)
4116 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4117 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
4118#else // defined(MATRIX_B_DEPTH)
4119 rhs_offset += z * rhs_stride_z;
4120#endif // defined(MATRIX_B_DEPTH)
4121
 // Per-row LHS offsets (zlhs0..) and a set of 16 zero offsets used for the RHS and bias loads
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004122 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
4123 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
giuros01b3204e72019-04-01 13:50:22 +01004124
4125#if defined(REINTERPRET_INPUT_AS_3D)
4126 // The plane (zlhs) is calculated dividing the start row (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01004127 CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
giuros01b3204e72019-04-01 13:50:22 +01004128
4129 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4130 // multiply lhs_stride_z by DEPTH_GEMM3D
4131 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
4132
4133#else // defined(REINTERPRET_INPUT_AS_3D)
4134
4135 // Add offset for batched GEMM
4136 lhs_offset += z * lhs_stride_z;
4137
4138#endif // defined(REINTERPRET_INPUT_AS_3D)
4139
4140 // Initialize the accumulators
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004141 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
giuros01b3204e72019-04-01 13:50:22 +01004142
 // Main loop: runs while at least K0 elements of the K dimension remain.
 // i is declared outside the loop because the left-over loop below continues from its final value.
4143 int i = 0;
4144 for(; i <= (K - K0); i += K0)
4145 {
4146 // Supported cases (M0, K0):
4147 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
4148 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
4149 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
4150 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
4151 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
4152 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
4153 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
4154 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
4155 // Load values from LHS matrix (M0 rows of K0 elements: a0..a(M0-1))
4156 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
4157
4158 // Load values from RHS matrix (K0 rows of N0 elements: b0..b(K0-1))
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004159 LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
giuros01b3204e72019-04-01 13:50:22 +01004160
 // Steps 0 and 1 are unconditional, i.e. K0 >= 2 is assumed; further steps depend on K0
4161 RHS_VFMA_M0xN0(0, a, b0, c);
4162 RHS_VFMA_M0xN0(1, a, b1, c);
4163#if K0 > 2
4164 RHS_VFMA_M0xN0(2, a, b2, c);
4165#endif // K0 > 2
4166#if K0 > 3
4167 RHS_VFMA_M0xN0(3, a, b3, c);
4168#endif // K0 > 3
4169#if K0 > 4
4170 RHS_VFMA_M0xN0(4, a, b4, c);
4171 RHS_VFMA_M0xN0(5, a, b5, c);
4172 RHS_VFMA_M0xN0(6, a, b6, c);
4173 RHS_VFMA_M0xN0(7, a, b7, c);
4174#endif // K0 > 4
4175#if K0 > 8
4176 RHS_VFMA_M0xN0(8, a, b8, c);
4177 RHS_VFMA_M0xN0(9, a, b9, c);
Gian Marco Iodice7b9d7ca2019-09-19 16:37:39 +01004178 RHS_VFMA_M0xN0(A, a, bA, c);
4179 RHS_VFMA_M0xN0(B, a, bB, c);
4180 RHS_VFMA_M0xN0(C, a, bC, c);
4181 RHS_VFMA_M0xN0(D, a, bD, c);
4182 RHS_VFMA_M0xN0(E, a, bE, c);
4183 RHS_VFMA_M0xN0(F, a, bF, c);
giuros01b3204e72019-04-01 13:50:22 +01004184#endif // K0 > 8
4185
 // Advance K0 elements along the LHS row and K0 rows down the RHS
4186 lhs_offset += K0 * sizeof(DATA_TYPE);
4187 rhs_offset += K0 * rhs_stride_y;
4188 }
4189
4190 // Left-over accumulations: one element of the K dimension per iteration
4191 for(; i < K; ++i)
4192 {
4193 // Load values from LHS matrix. Each scalar is stored in a 2-element vector so that
 // RHS_VFMA_M0xN0(0, a, ...) can read it through the .s0 component.
4194 VEC_DATA_TYPE(DATA_TYPE, 2)
4195 a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
4196#if M0 > 1
4197 VEC_DATA_TYPE(DATA_TYPE, 2)
4198 a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
4199#endif // M0 > 1
4200#if M0 > 2
4201 VEC_DATA_TYPE(DATA_TYPE, 2)
4202 a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
4203#endif // M0 > 2
4204#if M0 > 3
4205 VEC_DATA_TYPE(DATA_TYPE, 2)
4206 a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
4207#endif // M0 > 3
4208#if M0 > 4
4209 VEC_DATA_TYPE(DATA_TYPE, 2)
4210 a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
4211#endif // M0 > 4
4212#if M0 > 5
4213 VEC_DATA_TYPE(DATA_TYPE, 2)
4214 a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
4215#endif // M0 > 5
4216#if M0 > 6
4217 VEC_DATA_TYPE(DATA_TYPE, 2)
4218 a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
4219#endif // M0 > 6
4220#if M0 > 7
4221 VEC_DATA_TYPE(DATA_TYPE, 2)
4222 a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
4223#endif // M0 > 7
4224
 // Load one RHS row of N0 elements and accumulate it against the M0 LHS scalars
4225 VEC_DATA_TYPE(DATA_TYPE, N0)
4226 b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
4227 RHS_VFMA_M0xN0(0, a, b, c);
4228
4229 lhs_offset += sizeof(DATA_TYPE);
4230 rhs_offset += rhs_stride_y;
4231 }
4232
 // Destination address of the first element of this work-item's block (same start row as the LHS)
SiCong Li406a13f2020-07-15 12:09:58 +01004233 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
giuros01b3204e72019-04-01 13:50:22 +01004234
 // Per-row output offsets; they stay 0 unless the output is reinterpreted as 3D
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004235 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
giuros01b3204e72019-04-01 13:50:22 +01004236
4237#if defined(REINTERPRET_OUTPUT_AS_3D)
4238 // The plane (zout) is calculated dividing the start row (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01004239 CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
giuros01b3204e72019-04-01 13:50:22 +01004240
4241 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4242 // multiply dst_stride_z by DEPTH_GEMM3D
4243 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
4244
4245#else // defined(REINTERPRET_OUTPUT_AS_3D)
4246
4247 // Add offset for batched GEMM
4248 dst_addr += z * dst_stride_z;
4249
4250#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4251
4252 // Multiply by the weight of matrix-matrix product (ALPHA); the result is stored further below
giuros01b3204e72019-04-01 13:50:22 +01004253#if defined(ALPHA)
4254 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
4255#endif // defined(ALPHA)
4256
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004257 // Add beta*bias
4258#if defined(BETA)
4259#if defined(BROADCAST_BIAS)
 // Broadcast case: a single bias row (bias0) is loaded and added to every row of c
4260 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
4261
4262 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
4263
4264#ifndef UNIT_BETA
4265 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
4266#endif // UNIT_BETA
4267
4268 // c = c + bias[broadcasted]
4269 ADD_BLOCK_BROADCAST(M0, c, bias0);
4270
4271#else // defined(BROADCAST_BIAS)
 // Full bias case: an M0 x N0 bias block is loaded from the same position as the output block
SiCong Li406a13f2020-07-15 12:09:58 +01004272 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004273
4274 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
4275
4276#ifndef UNIT_BETA
4277 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
4278#endif // UNIT_BETA
4279
4280 // c = c + bias
4281 ADD_BLOCK(M0, c, bias);
4282
4283#endif // defined(BROADCAST_BIAS)
4284#endif // defined(BETA)
4285
 // Optional activation, applied after the bias addition.
 // NOTE(review): VEC_SIZE is not defined in the visible part of this file; presumably it is supplied
 // at build time and should match N0 - confirm.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004286#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01004287 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004288#endif // defined(ACTIVATION_TYPE)
4289
 // cond_y is true only for the first block of rows (y == 0); presumably COMPUTE_M0_START_ROW places
 // the partial block of PARTIAL_STORE_M0 rows first - verify against the helper definition.
 // cond_x is true when the end of this block reaches or passes the N boundary.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01004290 const bool cond_y = y == 0;
4291 const bool cond_x = ((x + 1) * N0 >= N);
4292
giuros01b3204e72019-04-01 13:50:22 +01004293 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01004294 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
giuros01b3204e72019-04-01 13:50:22 +01004295
 // Release the kernel-local helper macros.
 // NOTE(review): RHS_STEP_X is undefined here although this kernel never defines it (harmless).
4296#undef RHS_BLOCK_SIZE
4297#undef RHS_OFFSET_X
4298#undef RHS_STEP_X
4299}
4300#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
4301
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004302#if defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004303/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
4304 *
Gian Marco19835e52018-01-30 13:35:54 +00004305 * @note The value of beta needs to be passed at compile time using -DBETA
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004306 *
4307 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
4308 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
4309 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4310 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
4311 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004312 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
4313 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004314 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004315 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004316 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
4317 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
4318 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
4319 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004320 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
4321 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004322 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
4323 */
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of matrix C (src) and of the A x B result (dst)
    Tensor3D mtx_c  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D mtx_ab = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Typed views of the two addresses; dst is both read and written (in-place update)
    __global float *ab_addr = (__global float *)mtx_ab.ptr;
    __global float *c_addr  = (__global float *)mtx_c.ptr;

    // Fetch four floats of the A x B result first, then the matching four floats of C
    const float4 ab_vals = vload4(0, ab_addr);
    const float4 c_vals  = vload4(0, c_addr);

    // Accumulate BETA * C onto the A x B values and write them back over the A x B data
    vstore4(ab_vals + (float4)BETA * c_vals, 0, ab_addr);
}
4343
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel performs the in-place matrix addition dst = dst + BETA * src on F16 data.
 *
 * Each work-item loads eight half values from @p dst_ptr and eight from @p src_ptr, and stores the
 * eight results back to @p dst_ptr.
 *
 * @note The value of beta needs to be passed at compile time using -DBETA
 *
 * @param[in]     src_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]     src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]     src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix, which is read and then overwritten with the result. Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]     dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]     dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of matrix C (src) and of the A x B result (dst)
    Tensor3D mtx_c  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D mtx_ab = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Typed views of the two addresses; dst is both read and written (in-place update)
    __global half *ab_addr = (__global half *)mtx_ab.ptr;
    __global half *c_addr  = (__global half *)mtx_c.ptr;

    // Fetch eight halves of the A x B result first, then the matching eight halves of C
    const half8 ab_vals = vload8(0, ab_addr);
    const half8 c_vals  = vload8(0, c_addr);

    // Accumulate BETA * C onto the A x B values and write them back over the A x B data
    vstore8(ab_vals + (half8)BETA * c_vals, 0, ab_addr);
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Georgios Pinitas96b16b62020-12-01 17:41:34 +00004386#endif // defined(BETA)