/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Usama Arif0681e3b2019-04-25 14:28:07 +010024#include "gemm_helpers.h"
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +000025#include "repeat.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010027#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
/* Vector literals holding the lane indices (0, 1, ..., N-1), one per supported vector width N. */
#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/* Two-level expansion: INC() lets its argument (typically the build-time K0) be macro-expanded
 * to a number first; CONCAT_INC() then pastes that number onto the INC prefix. */
#define CONCAT_INC(K0) INC##K0
#define INC(K0) CONCAT_INC(K0)
35
36#if(SRC_WIDTH % K0)
37#define BOUNDARY_CONDITION_X(x, a) \
38 ({ \
39 a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
40 })
41#else // (SRC_WIDTH % K0)
42#define BOUNDARY_CONDITION_X(x, a) \
43 ({})
44#endif // (SRC_WIDTH % K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000045
/* Loads a block of (at most) M0 rows by K0 columns into the row variables prefixed by `a`,
 * shrinking the block at the bottom and/or right edge of the source tensor:
 *   - rows:    PARTIAL_LOAD_M0 instead of M0 when y * M0 + M0 >= SRC_HEIGHT and PARTIAL_LOAD_M0 != 0
 *   - columns: PARTIAL_LOAD_K0 instead of K0 when x * K0 + K0 >= SRC_WIDTH  and PARTIAL_LOAD_K0 != 0
 * The actual load is delegated to LOAD_TENSOR_M0XN0 with the selected sizes.
 *
 * NOTE: `x` and `y` are NOT macro parameters; they are read from the scope in which the macro is
 * expanded (the block coordinates of the calling kernel).
 */
#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({ \
        if(y * (M0) + (M0) >= (SRC_HEIGHT) && (PARTIAL_LOAD_M0) != 0) \
        { \
            if(x * (K0) + (K0) >= (SRC_WIDTH) && (PARTIAL_LOAD_K0) != 0) \
            { \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
            else \
            { \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
        } \
        else \
        { \
            if(x * (K0) + (K0) >= (SRC_WIDTH) && (PARTIAL_LOAD_K0) != 0) \
            { \
                LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
            else \
            { \
                LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
        } \
    })
71
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000072/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
73 * the output matrix unrolling the values.
74 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010075 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
76 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010077 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010078 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
79 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +010080 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
81 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000082 * @note Only the following values for M0, K0 and V0 are supported:
83 * M0: 2,3,4,5,6,7,8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +000084 * K0: 2,3,4,8,16
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000085 * V0: greater than 0
Gian Marco Iodiced1f54762019-07-19 09:54:47 +010086 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000087 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
88 * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
89 * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
90 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
91 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
92 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +010093 * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +000094 * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
95 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
96 * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
97 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
98 * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
99 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
100 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
101 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
102 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
103 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
104 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
105 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
106 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
107 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
108 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
109 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
110 */
111__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
112 TENSOR3D_DECLARATION(dst)
113#if defined(REINTERPRET_INPUT_AS_3D)
114 ,
115 uint cross_plane_pad
116#endif // REINTERPRET_INPUT_AS_3D
117 )
118{
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000119 // Block size
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000120#define BLOCK_SIZE ((M0) * (K0))
121
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000122 // Output offset X
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000123#if defined(INTERLEAVE)
124#define OUTPUT_OFFSET_X (K0)
125#else // defined(INTERLEAVE)
126#define OUTPUT_OFFSET_X (BLOCK_SIZE)
127#endif // defined(INTERLEAVE)
128
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +0000129 // Output step X
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000130#if defined(INTERLEAVE)
131#define OUTPUT_STEP_X (K0) * (V0)
132#else // Do not interleave
133#define OUTPUT_STEP_X (K0)
134#endif // defined(INTERLEAVE)
135
136 // Compute source and destination addresses
137 uint x = get_global_id(0);
138 uint y = get_global_id(1);
139 uint z = get_global_id(2);
140
141 // ------------------ Compute input/output addresses ---------------------------
142
143 // Compute the input address
144 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
145
146 // Compute the output address
147 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
148 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
149
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000150 // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
151 REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000152
153#if defined(REINTERPRET_INPUT_AS_3D)
154 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
155 // multiply src_stride_z by DEPTH_GEMM3D
156
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000157 input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
158
159 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +0100160 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000161
162#else // defined(REINTERPRET_INPUT_AS_3D)
163
164 input_ptr += z * (uint)src_stride_z;
165
166#endif // defined(REINTERPRET_INPUT_AS_3D)
167
168 // Add offset for batched GEMM
169 output_ptr += z * (uint)dst_stride_z;
170
171 // ---------------------------Load input values --------------------------------
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000172 // Load values from the LHS matrix
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100173 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
174
175 LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
176
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000177 // ---------------------------Store output values ------------------------------
Usama Arif0681e3b2019-04-25 14:28:07 +0100178 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
179 STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000180
181#undef BLOCK_SIZE
182#undef OUTPUT_OFFSET_X
183#undef OUTPUT_STEP_X
184}
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000185
186#if M0 == 2
187#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
188 ({ \
189 VEC_DATA_TYPE(DATA_TYPE, M0) \
190 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
191 VSTORE(M0) \
192 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
193 })
194#elif M0 == 3 // M0 == 3
195#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
196 ({ \
197 VEC_DATA_TYPE(DATA_TYPE, M0) \
198 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
199 VSTORE(M0) \
200 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
201 })
202#elif M0 == 4 // M0 == 4
203#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
204 ({ \
205 VEC_DATA_TYPE(DATA_TYPE, M0) \
206 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
207 VSTORE(M0) \
208 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
209 })
210#elif M0 == 5 // M0 == 5
211#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
212 ({ \
213 VEC_DATA_TYPE(DATA_TYPE, 4) \
214 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
215 DATA_TYPE res1 = a4.s##i; \
216 VSTORE(4) \
217 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
218 *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
219 })
220#elif M0 == 6 // M0 == 6
221#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
222 ({ \
223 VEC_DATA_TYPE(DATA_TYPE, 4) \
224 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
225 VEC_DATA_TYPE(DATA_TYPE, 2) \
226 res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
227 VSTORE(4) \
228 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
229 VSTORE(2) \
230 (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
231 })
232#elif M0 == 7 // M0 == 7
233#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
234 ({ \
235 VEC_DATA_TYPE(DATA_TYPE, 4) \
236 res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
237 VEC_DATA_TYPE(DATA_TYPE, 3) \
238 res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
239 VSTORE(4) \
240 (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
241 VSTORE(3) \
242 (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
243 })
244#elif M0 == 8 // M0 == 8
245#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
246 ({ \
247 VEC_DATA_TYPE(DATA_TYPE, M0) \
248 res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
249 VSTORE(M0) \
250 (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
251 })
252#else // M0 not supported
253#error "M0 value not supported"
254#endif // N0 conditions
255
256/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
257 * the output matrix unrolling the values.
258 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100259 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
260 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100261 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100262 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
263 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100264 * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
265 * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000266 * @note Only the following values for M0, K0 and V0 are supported:
267 * M0: 2,3,4,5,6,7,8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000268 * K0: 2,3,4,8,16
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000269 * V0: greater than 0
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100270 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000271 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
272 * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
273 * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
274 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
275 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
276 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100277 * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000278 * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
279 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
280 * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
281 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
282 * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
283 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
284 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
285 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
286 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
287 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
288 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
289 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
290 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
291 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
292 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
293 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
294 */
295__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
296 TENSOR3D_DECLARATION(dst)
297#if defined(REINTERPRET_INPUT_AS_3D)
298 ,
299 uint cross_plane_pad
300#endif // REINTERPRET_INPUT_AS_3D
301 )
302{
303 // Block size
304#define BLOCK_SIZE ((M0) * (K0))
305
306 // Output offset X
307#if defined(INTERLEAVE)
308#define OUTPUT_OFFSET_X (M0)
309#else // defined(INTERLEAVE)
310#define OUTPUT_OFFSET_X (BLOCK_SIZE)
311#endif // defined(INTERLEAVE)
312
313 // Output step X
314#if defined(INTERLEAVE)
315#define OUTPUT_STEP_X (M0) * (V0)
316#else // Do not interleave
317#define OUTPUT_STEP_X (M0)
318#endif // defined(INTERLEAVE)
319
320 // Compute source and destination addresses
321 uint x = get_global_id(0);
322 uint y = get_global_id(1);
323 uint z = get_global_id(2);
324
325 // ------------------ Compute input/output addresses ---------------------------
326
327 // Compute the input address
328 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
329
330 // Compute the output address
331 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
332 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
333
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000334 // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
335 REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000336
337#if defined(REINTERPRET_INPUT_AS_3D)
338 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
339 // multiply src_stride_z by DEPTH_GEMM3D
340
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000341 input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
342
343 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Usama Arif0681e3b2019-04-25 14:28:07 +0100344 CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000345
346#else // defined(REINTERPRET_INPUT_AS_3D)
347
348 input_ptr += z * (uint)src_stride_z;
349
350#endif // defined(REINTERPRET_INPUT_AS_3D)
351
352 // Add offset for batched GEMM
353 output_ptr += z * (uint)dst_stride_z;
354
355 // ---------------------------Load input values --------------------------------
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100356 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000357
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100358 LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
359
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000360 // ---------------------------Transpose and store block -----------------------
361
362 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
363 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
364#if K0 > 2
365 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000366#endif // K0 > 2
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000367#if K0 > 3
368 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
369#endif // K0 > 3
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000370#if K0 > 4
371 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
372 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
373 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
374 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
375#endif // K0 > 4
376#if K0 > 8
377 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
378 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
379 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
380 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
381 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
382 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
383 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
384 TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
385#endif // K0 > 8
386
387#undef BLOCK_SIZE
388#undef OUTPUT_OFFSET_X
389#undef OUTPUT_STEP_X
390}
Gian Marco Iodice73cdaac2020-08-10 21:44:14 +0100391#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
Gian Marco Iodice5ba5e092018-12-06 17:13:09 +0000392
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000393#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
394/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
395 * the output matrix unrolling the values.
396 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +0100397 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
398 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
399 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
400 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000401 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
402 * @note Only the following values for K0, N0 and H0 are supported:
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000403 * N0: 2,3,4,8,16
404 * K0: 1,2,3,4,8,16
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000405 * H0: greater than 0
406 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100407 * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000408 * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
409 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
410 * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
411 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
412 * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
413 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
414 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
415 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
416 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
417 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
418 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
419 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
420 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
421 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
422 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
423 */
424__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
425 TENSOR3D_DECLARATION(dst))
426{
427 // Block size
428#define BLOCK_SIZE ((K0) * (N0))
429
430 // Output offset X
431#if defined(INTERLEAVE)
432#define OUTPUT_OFFSET_X (N0)
433#else // defined(INTERLEAVE)
434#define OUTPUT_OFFSET_X (BLOCK_SIZE)
435#endif // defined(INTERLEAVE)
436
437 // Output step X
438#if defined(INTERLEAVE)
439#define OUTPUT_STEP_X (N0) * (H0)
440#else // Do not interleave
441#define OUTPUT_STEP_X (N0)
442#endif // defined(INTERLEAVE)
443
444 // Compute source and destination addresses
445 uint x = get_global_id(0);
446 uint y = get_global_id(1);
447 uint z = get_global_id(2);
448
449 // ------------------ Compute input/output addresses ---------------------------
450
451 // Compute the input address
452 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
453
454 // Compute the output address
455 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
456 x / (uint)H0)
457 * (uint)dst_stride_y)
458 + z * (uint)dst_stride_z;
459
460 // ---------------------------Load input values --------------------------------
461
Vidhya Sudhan Loganathan17b0f8b2019-01-08 12:17:03 +0000462 REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0;
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000463
464 // Load values from the RHS matrix
465 a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
466#if K0 > 1
467 if(y * (uint)K0 + 1 < SRC_HEIGHT)
468 {
469 a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
470 }
471#endif // K0 > 1
472#if K0 > 2
473 if(y * (uint)K0 + 2 < SRC_HEIGHT)
474 {
475 a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
476 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000477#endif // K0 > 2
478#if K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000479 if(y * (uint)K0 + 3 < SRC_HEIGHT)
480 {
481 a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
482 }
Gian Marco Iodicebacfec52019-01-11 11:30:55 +0000483#endif // K0 > 3
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000484#if K0 > 4
485 if(y * (uint)K0 + 4 < SRC_HEIGHT)
486 {
487 a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
488 }
489 if(y * (uint)K0 + 5 < SRC_HEIGHT)
490 {
491 a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
492 }
493 if(y * (uint)K0 + 6 < SRC_HEIGHT)
494 {
495 a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
496 }
497 if(y * (uint)K0 + 7 < SRC_HEIGHT)
498 {
499 a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
500 }
501#endif // K0 > 4
502#if K0 > 8
Gian Marco Iodice08ddd7b2018-12-19 10:01:18 +0000503 if(y * (uint)K0 + 8 < SRC_HEIGHT)
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000504 {
505 a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
506 }
507 if(y * (uint)K0 + 9 < SRC_HEIGHT)
508 {
509 a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
510 }
511 if(y * (uint)K0 + 10 < SRC_HEIGHT)
512 {
513 aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
514 }
515 if(y * (uint)K0 + 11 < SRC_HEIGHT)
516 {
517 aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
518 }
519 if(y * (uint)K0 + 12 < SRC_HEIGHT)
520 {
521 aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
522 }
523 if(y * (uint)K0 + 13 < SRC_HEIGHT)
524 {
525 aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
526 }
527 if(y * (uint)K0 + 14 < SRC_HEIGHT)
528 {
529 aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
530 }
531 if(y * (uint)K0 + 15 < SRC_HEIGHT)
532 {
533 aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
534 }
535#endif // K0 > 8
536
537 // ---------------------------Store output values ------------------------------
Usama Arif0681e3b2019-04-25 14:28:07 +0100538 REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
539 STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
Gian Marco Iodice3b0a2652018-12-07 11:18:09 +0000540
541#undef BLOCK_SIZE
542#undef OUTPUT_OFFSET_X
543#undef OUTPUT_STEP_X
544}
545
#if defined(TRANSPOSE)
/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
 * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 * @note The option -DTRANSPOSE must be passed at compile time.
 * @note Rows of a block located at or beyond SRC_HEIGHT are not read; the corresponding values of the transposed block are stored as zero.
 * @note Only the following values for K0, N0 and H0 are supported:
 *                                      N0: 2,3,4,8,16
 *                                      K0: 2,3,4,8,16
 *                                      H0: greater than 0
 *
 * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst))
{
    // Number of elements in one K0xN0 block
#define BLOCK_SIZE ((K0) * (N0))

    // OUTPUT_OFFSET_X: offset (in elements) applied per block index (x % H0) within an output row
    // OUTPUT_STEP_X:   step (in elements) handed to STORE_BLOCK as the stride between the stored vectors
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#define OUTPUT_STEP_X ((K0) * (H0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Loads row ROW of the current block into VAR, but only when that row lies inside the source tensor.
    // Rows that are skipped keep the zero they were initialised with.
#define LOAD_RHS_ROW_IF_IN_BOUNDS(ROW, VAR)                                                   \
    do                                                                                        \
    {                                                                                         \
        if(y * (uint)K0 + (ROW) < SRC_HEIGHT)                                                 \
        {                                                                                     \
            VAR = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + (ROW) * src_stride_y));     \
        }                                                                                     \
    } while(0)

    // Builds a K0-wide vector out of component i of every loaded row (i.e. column i of the K0xN0 block).
    // The argument is only ever token-pasted, so hexadecimal component names (A..F) are never macro-expanded.
#if K0 == 2
#define GATHER_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i)
#elif K0 == 3 // K0 == 3
#define GATHER_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i, a2.s##i)
#elif K0 == 4 // K0 == 4
#define GATHER_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i)
#elif K0 == 8 // K0 == 8
#define GATHER_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i)
#elif K0 == 16 // K0 == 16
#define GATHER_COLUMN(i) (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i, \
                                                         a8.s##i, a9.s##i, aA.s##i, aB.s##i, aC.s##i, aD.s##i, aE.s##i, aF.s##i)
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
                                 (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    a0=0, a1=0, ... a(K0-1)=0;

    // Load values from the RHS matrix. The first row of the block is always read.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
    LOAD_RHS_ROW_IF_IN_BOUNDS(1, a1);
#if K0 > 2
    LOAD_RHS_ROW_IF_IN_BOUNDS(2, a2);
#endif // K0 > 2
#if K0 > 3
    LOAD_RHS_ROW_IF_IN_BOUNDS(3, a3);
#endif // K0 > 3
#if K0 > 4
    LOAD_RHS_ROW_IF_IN_BOUNDS(4, a4);
    LOAD_RHS_ROW_IF_IN_BOUNDS(5, a5);
    LOAD_RHS_ROW_IF_IN_BOUNDS(6, a6);
    LOAD_RHS_ROW_IF_IN_BOUNDS(7, a7);
#endif // K0 > 4
#if K0 > 8
    LOAD_RHS_ROW_IF_IN_BOUNDS(8, a8);
    LOAD_RHS_ROW_IF_IN_BOUNDS(9, a9);
    LOAD_RHS_ROW_IF_IN_BOUNDS(10, aA);
    LOAD_RHS_ROW_IF_IN_BOUNDS(11, aB);
    LOAD_RHS_ROW_IF_IN_BOUNDS(12, aC);
    LOAD_RHS_ROW_IF_IN_BOUNDS(13, aD);
    LOAD_RHS_ROW_IF_IN_BOUNDS(14, aE);
    LOAD_RHS_ROW_IF_IN_BOUNDS(15, aF);
#endif // K0 > 8

    // ---------------------------Transpose the block ------------------------------
    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0)    res0=0, res1=0, res2=0,... res(N0-1)=0;

    // K0xN0 -> N0xK0: result vector i collects component i of each of the K0 loaded rows
    res0 = GATHER_COLUMN(0);
    res1 = GATHER_COLUMN(1);
#if N0 > 2
    res2 = GATHER_COLUMN(2);
#endif // N0 > 2
#if N0 > 3
    res3 = GATHER_COLUMN(3);
#endif // N0 > 3
#if N0 > 4
    res4 = GATHER_COLUMN(4);
    res5 = GATHER_COLUMN(5);
    res6 = GATHER_COLUMN(6);
    res7 = GATHER_COLUMN(7);
#endif // N0 > 4
#if N0 > 8
    res8 = GATHER_COLUMN(8);
    res9 = GATHER_COLUMN(9);
    resA = GATHER_COLUMN(A);
    resB = GATHER_COLUMN(B);
    resC = GATHER_COLUMN(C);
    resD = GATHER_COLUMN(D);
    resE = GATHER_COLUMN(E);
    resF = GATHER_COLUMN(F);
#endif // N0 > 8

    // ---------------------------Store the output values ------------------------------
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef GATHER_COLUMN
#undef LOAD_RHS_ROW_IF_IN_BOUNDS
#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
#endif // defined(TRANSPOSE)
874#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
875
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000876#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +0000877
// Pastes two tokens together (arguments are NOT macro-expanded before pasting)
#define CONCAT(a, b) a##b

// ARM_DOT<k>(a, b, c): accumulates into c the dot product of the first k components of a and b,
// one fma at a time and in ascending component order: c = fma(a.s<i>, b.s<i>, c).
// For k = 1 the operands a and b are used directly (no component access).
// For k = 8 and k = 16 the operands are split through their .lo / .hi halves.
// Every argument is parenthesised so that expressions such as (v.lo) can be passed safely, and the
// multi-statement bodies are wrapped in do { } while(0) so that each macro behaves as one statement.
#define ARM_DOT1(a, b, c)           \
    do                              \
    {                               \
        (c) = fma((a), (b), (c));   \
    } while(0)
#define ARM_DOT2(a, b, c)                 \
    do                                    \
    {                                     \
        (c) = fma((a).s0, (b).s0, (c));   \
        (c) = fma((a).s1, (b).s1, (c));   \
    } while(0)
#define ARM_DOT3(a, b, c)                 \
    do                                    \
    {                                     \
        ARM_DOT2(a, b, c);                \
        (c) = fma((a).s2, (b).s2, (c));   \
    } while(0)
#define ARM_DOT4(a, b, c)                 \
    do                                    \
    {                                     \
        ARM_DOT3(a, b, c);                \
        (c) = fma((a).s3, (b).s3, (c));   \
    } while(0)
#define ARM_DOT8(a, b, c)                 \
    do                                    \
    {                                     \
        ARM_DOT4(((a).lo), ((b).lo), c);  \
        ARM_DOT4(((a).hi), ((b).hi), c);  \
    } while(0)
#define ARM_DOT16(a, b, c)                \
    do                                    \
    {                                     \
        ARM_DOT8(((a).lo), ((b).lo), c);  \
        ARM_DOT8(((a).hi), ((b).hi), c);  \
    } while(0)
909
910#if N0 == 2
911#define ARM_DOT_K0XN0(k0, a, b, c) \
912 ({ \
913 CONCAT(ARM_DOT, k0) \
914 ((a), (b##0), (c.s0)); \
915 CONCAT(ARM_DOT, k0) \
916 ((a), (b##1), (c.s1)); \
917 })
918#elif N0 == 3 // N0 == 3
919#define ARM_DOT_K0XN0(k0, a, b, c) \
920 ({ \
921 CONCAT(ARM_DOT, k0) \
922 ((a), (b##0), (c.s0)); \
923 CONCAT(ARM_DOT, k0) \
924 ((a), (b##1), (c.s1)); \
925 CONCAT(ARM_DOT, k0) \
926 ((a), (b##2), (c.s2)); \
927 })
928#elif N0 == 4 // N0 == 4
929#define ARM_DOT_K0XN0(k0, a, b, c) \
930 ({ \
931 CONCAT(ARM_DOT, k0) \
932 ((a), (b##0), (c.s0)); \
933 CONCAT(ARM_DOT, k0) \
934 ((a), (b##1), (c.s1)); \
935 CONCAT(ARM_DOT, k0) \
936 ((a), (b##2), (c.s2)); \
937 CONCAT(ARM_DOT, k0) \
938 ((a), (b##3), (c.s3)); \
939 })
940#elif N0 == 8 // N0 == 8
941#define ARM_DOT_K0XN0(k0, a, b, c) \
942 ({ \
943 CONCAT(ARM_DOT, k0) \
944 ((a), (b##0), (c.s0)); \
945 CONCAT(ARM_DOT, k0) \
946 ((a), (b##1), (c.s1)); \
947 CONCAT(ARM_DOT, k0) \
948 ((a), (b##2), (c.s2)); \
949 CONCAT(ARM_DOT, k0) \
950 ((a), (b##3), (c.s3)); \
951 CONCAT(ARM_DOT, k0) \
952 ((a), (b##4), (c.s4)); \
953 CONCAT(ARM_DOT, k0) \
954 ((a), (b##5), (c.s5)); \
955 CONCAT(ARM_DOT, k0) \
956 ((a), (b##6), (c.s6)); \
957 CONCAT(ARM_DOT, k0) \
958 ((a), (b##7), (c.s7)); \
959 })
960#elif N0 == 16 // N0 == 16
961#define ARM_DOT_K0XN0(k0, a, b, c) \
962 ({ \
963 CONCAT(ARM_DOT, k0) \
964 ((a), (b##0), (c.s0)); \
965 CONCAT(ARM_DOT, k0) \
966 ((a), (b##1), (c.s1)); \
967 CONCAT(ARM_DOT, k0) \
968 ((a), (b##2), (c.s2)); \
969 CONCAT(ARM_DOT, k0) \
970 ((a), (b##3), (c.s3)); \
971 CONCAT(ARM_DOT, k0) \
972 ((a), (b##4), (c.s4)); \
973 CONCAT(ARM_DOT, k0) \
974 ((a), (b##5), (c.s5)); \
975 CONCAT(ARM_DOT, k0) \
976 ((a), (b##6), (c.s6)); \
977 CONCAT(ARM_DOT, k0) \
978 ((a), (b##7), (c.s7)); \
979 CONCAT(ARM_DOT, k0) \
980 ((a), (b##8), (c.s8)); \
981 CONCAT(ARM_DOT, k0) \
982 ((a), (b##9), (c.s9)); \
983 CONCAT(ARM_DOT, k0) \
984 ((a), (b##A), (c.sA)); \
985 CONCAT(ARM_DOT, k0) \
986 ((a), (b##B), (c.sB)); \
987 CONCAT(ARM_DOT, k0) \
988 ((a), (b##C), (c.sC)); \
989 CONCAT(ARM_DOT, k0) \
990 ((a), (b##D), (c.sD)); \
991 CONCAT(ARM_DOT, k0) \
992 ((a), (b##E), (c.sE)); \
993 CONCAT(ARM_DOT, k0) \
994 ((a), (b##F), (c.sF)); \
995 })
996#else // N0 not supported
997#error "N0 value not supported"
998#endif // N0 conditions
999
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type was passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
1059__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
1060 IMAGE_DECLARATION(rhs),
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001061#if defined(BETA)
1062 IMAGE_DECLARATION(bias),
1063#endif // defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001064 IMAGE_DECLARATION(dst),
1065 uint lhs_stride_z,
1066 uint rhs_stride_z,
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001067#if defined(BETA)
1068 uint bias_stride_z,
1069#endif //defined(BETA)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001070 uint dst_stride_z
1071#if defined(REINTERPRET_INPUT_AS_3D)
1072 ,
1073 uint lhs_cross_plane_pad
1074#endif // REINTERPRET_INPUT_AS_3D
1075#if defined(REINTERPRET_OUTPUT_AS_3D)
1076 ,
1077 uint dst_cross_plane_pad
1078#endif // REINTERPRET_OUTPUT_AS_3D
1079 )
1080{
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001081 // Block size
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001082#define RHS_BLOCK_SIZE ((K0) * (N0))
1083
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001084 // RHS offset and step X
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001085#if defined(RHS_INTERLEAVE)
1086#define RHS_OFFSET_X (K0)
1087#define RHS_STEP_X ((K0) * (H0))
1088#define RHS_STEP_LOOP (1)
1089#else // defined(RHS_INTERLEAVE)
1090#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1091#define RHS_STEP_X (K0)
1092#define RHS_STEP_LOOP (H0)
1093#endif // defined(RHS_INTERLEAVE)
1094
1095 uint x = get_global_id(0);
1096 uint y = get_global_id(1);
1097 uint z = get_global_id(2);
1098
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001099#if defined(DUMMY_WORK_ITEMS)
1100 if((x * N0 >= N) || (y * M0 >= M))
1101 {
1102 return;
1103 }
1104#endif // defined(DUMMY_WORK_ITEMS)
1105
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001106 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01001107 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001108
Sheri Zhang1a378102020-04-30 12:59:39 +01001109 // Compute RHS reshaped matrix address
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001110 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
1111
1112#if defined(MATRIX_B_DEPTH)
1113 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1114 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1115#else // defined(MATRIX_B_DEPTH)
1116 rhs_offset += z * rhs_stride_z;
1117#endif // defined(MATRIX_B_DEPTH)
1118
Usama Arif0681e3b2019-04-25 14:28:07 +01001119 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001120 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001121
1122#if defined(REINTERPRET_INPUT_AS_3D)
Usama Arif0681e3b2019-04-25 14:28:07 +01001123 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01001124 CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001125
1126 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1127 // multiply lhs_stride_z by DEPTH_GEMM3D
1128 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1129
1130#else // defined(REINTERPRET_INPUT_AS_3D)
1131
1132 // Add offset for batched GEMM
1133 lhs_offset += z * lhs_stride_z;
1134
1135#endif // defined(REINTERPRET_INPUT_AS_3D)
1136
1137 // Initialize the accumulators
1138 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
1139
1140 int i = 0;
1141 for(; i <= (K - K0); i += K0)
1142 {
1143 // Supported cases (M0, K0):
1144 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
1145 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
1146 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
1147 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
1148 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
1149 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
1150 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
1151 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
1152 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001153 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001154
Sheri Zhang1a378102020-04-30 12:59:39 +01001155 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001156 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001157
1158 // Accumulate
1159 ARM_DOT_K0XN0(K0, a0, b, c0);
1160#if M0 > 1
1161 ARM_DOT_K0XN0(K0, a1, b, c1);
1162#endif // M0 > 1
1163#if M0 > 2
1164 ARM_DOT_K0XN0(K0, a2, b, c2);
1165#endif // M0 > 2
1166#if M0 > 3
1167 ARM_DOT_K0XN0(K0, a3, b, c3);
1168#endif // M0 > 3
1169#if M0 > 4
1170 ARM_DOT_K0XN0(K0, a4, b, c4);
1171#endif // M0 > 4
1172#if M0 > 5
1173 ARM_DOT_K0XN0(K0, a5, b, c5);
1174#endif // M0 > 5
1175#if M0 > 6
1176 ARM_DOT_K0XN0(K0, a6, b, c6);
1177#endif // M0 > 6
1178#if M0 > 7
1179 ARM_DOT_K0XN0(K0, a7, b, c7);
1180#endif // M0 > 7
1181
1182 lhs_offset += K0 * sizeof(DATA_TYPE);
1183 rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
1184 }
1185
1186 // Left-over accumulations
1187 for(; i < K; ++i)
1188 {
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001189 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001190 LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001191
Sheri Zhang1a378102020-04-30 12:59:39 +01001192 // Load values from RHS reshaped matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001193 LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001194
1195 // Accumulate
1196 ARM_DOT_K0XN0(1, a0, b, c0);
1197#if M0 > 1
1198 ARM_DOT_K0XN0(1, a1, b, c1);
1199#endif // M0 > 1
1200#if M0 > 2
1201 ARM_DOT_K0XN0(1, a2, b, c2);
1202#endif // M0 > 2
1203#if M0 > 3
1204 ARM_DOT_K0XN0(1, a3, b, c3);
1205#endif // M0 > 3
1206#if M0 > 4
1207 ARM_DOT_K0XN0(1, a4, b, c4);
1208#endif // M0 > 4
1209#if M0 > 5
1210 ARM_DOT_K0XN0(1, a5, b, c5);
1211#endif // M0 > 5
1212#if M0 > 6
1213 ARM_DOT_K0XN0(1, a6, b, c6);
1214#endif // M0 > 6
1215#if M0 > 7
1216 ARM_DOT_K0XN0(1, a7, b, c7);
1217#endif // M0 > 7
1218
1219 lhs_offset += sizeof(DATA_TYPE);
1220 rhs_offset += sizeof(DATA_TYPE);
1221 }
1222
SiCong Li406a13f2020-07-15 12:09:58 +01001223 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001224
1225 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1226
1227#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001228
1229 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01001230 CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001231
1232 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1233 // multiply dst_stride_z by DEPTH_GEMM3D
1234 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1235
1236#else // defined(REINTERPRET_OUTPUT_AS_3D)
1237
1238 // Add offset for batched GEMM
1239 dst_addr += z * dst_stride_z;
1240
1241#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1242
1243 // Multiply by the weight of matrix-matrix product and store the result
1244#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01001245 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001246#endif // defined(ALPHA)
1247
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001248 // Add beta*bias
1249#if defined(BETA)
1250#if defined(BROADCAST_BIAS)
1251 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
1252
1253 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1254
1255#ifndef UNIT_BETA
1256 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
1257#endif // UNIT_BIAS
1258
1259 // c = c + bias[broadcasted]
1260 ADD_BLOCK_BROADCAST(M0, c, bias0);
1261
1262#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01001263 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001264
1265 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1266
1267#ifndef UNIT_BETA
1268 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
1269#endif // UNIT_BIAS
1270
1271 // c = c + bias
1272 ADD_BLOCK(M0, c, bias);
1273
1274#endif // defined(BROADCAST_BIAS)
1275#endif // defined(BETA)
1276
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001277#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01001278 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001279#endif // defined(ACTIVATION_TYPE)
1280
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01001281 const bool cond_y = y == 0;
1282 const bool cond_x = ((x + 1) * N0 >= N);
1283
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001284 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001285 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00001286
1287#undef RHS_BLOCK_SIZE
1288#undef RHS_OFFSET_X
1289#undef RHS_STEP_X
1290}
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001291
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 * @note @p rhs_stride_z is not referenced by the kernel body: the RHS plane is selected through the image row coordinate (z_rhs * RHS_HEIGHT).
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t rhs_img,
#if defined(BETA)
                                                  IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                  IMAGE_DECLARATION(dst),
                                                  uint lhs_stride_z,
                                                  uint rhs_stride_z,
#if defined(BETA)
                                                  uint bias_stride_z,
#endif //defined(BETA)
                                                  uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                  ,
                                                  uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
                                                  uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Number of K elements that do not fill a whole K0 block
#define LEFTOVER_K (K % K0)

    // Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items that fall completely outside the MxN output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: consume K in whole blocks of K0 elements
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

#if LEFTOVER_K != 0
    // Note: We cannot read out-of-bound elements from the RHS matrix because
    // the RHS width is always multiple of K0. This is not true for the LHS matrix

    // The union lets the LHS row be filled one scalar at a time (.s[k]) and then be
    // consumed as a K0-wide vector (.v); lanes that are not written stay at zero.
    union UNION_VEC_TYPE
    {
        DATA_TYPE s[K0];
        VEC_DATA_TYPE(DATA_TYPE, K0)
        v;
    };

    union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
    union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
    union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
    union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
    union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
    union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
    union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
    union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7

    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

    // Load from RHS matrix
    LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

    // Load from LHS matrix: only the LEFTOVER_K valid elements of each row are read
    for(int k = 0; k < LEFTOVER_K; ++k)
    {
        a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
        a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
        a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
        a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
        a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
        a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
        a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
        a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7

        lhs_offset += sizeof(DATA_TYPE);
    }

    // Accumulate
    ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
    ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
    ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
    ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
    ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
    ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
    ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
    ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7

#endif // LEFTOVER_K != 0

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0, zout1=0, ... zout(M0-1)=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Boundary flags consumed by STORE_BLOCK_BOUNDARY_AWARE:
    // cond_y is set for the first block of rows, cond_x when this N0-wide column block reaches or crosses N
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // Release every kernel-local macro so that none of them leaks into the code that follows
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef LEFTOVER_K
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
1640
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001641#define VFMA(a, b, c) \
1642 ({ \
1643 c = fma(a, b, c); \
1644 })
1645
1646#if M0 == 1
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001647#define VFMA_M0xN0(i, a, b, c) \
1648 ({ \
1649 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001650 })
1651#elif M0 == 2 // M0 == 2
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001652#define VFMA_M0xN0(i, a, b, c) \
1653 ({ \
1654 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1655 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001656 })
1657#elif M0 == 3 // M0 == 3
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001658#define VFMA_M0xN0(i, a, b, c) \
1659 ({ \
1660 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1661 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1662 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001663 })
1664#elif M0 == 4 // M0 == 4
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001665#define VFMA_M0xN0(i, a, b, c) \
1666 ({ \
1667 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1668 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1669 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1670 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001671 })
1672#elif M0 == 5 // M0 == 5
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001673#define VFMA_M0xN0(i, a, b, c) \
1674 ({ \
1675 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1676 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1677 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1678 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1679 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001680 })
1681#elif M0 == 6 // M0 == 6
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001682#define VFMA_M0xN0(i, a, b, c) \
1683 ({ \
1684 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1685 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1686 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1687 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1688 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1689 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001690 })
1691#elif M0 == 7 // M0 == 7
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001692#define VFMA_M0xN0(i, a, b, c) \
1693 ({ \
1694 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1695 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1696 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1697 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1698 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1699 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
1700 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001701 })
1702#elif M0 == 8 // M0 == 8
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001703#define VFMA_M0xN0(i, a, b, c) \
1704 ({ \
1705 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
1706 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
1707 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
1708 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
1709 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
1710 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
1711 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
1712 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001713 })
1714#else // M0 not supported
1715#error "M0 not supported"
1716#endif // M0 not supported
1717
1718/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1719 * The LHS matrix is NOT reshaped
1720 * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
1721 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001722 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
1724 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
1725 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
1726 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
SiCong Li3a501662020-06-26 10:02:06 +01001728 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1729 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001730 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1731 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1732 * - N0 = 2, 3, 4, 8, 16
1733 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001734 * - H0 >= 1
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001735 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01001736 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01001737 * The activation function is performed after the bias addition
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001738 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1739 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1740 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1741 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1742 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1743 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1744 *
Sheri Zhang1a378102020-04-30 12:59:39 +01001745 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
1746 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001747 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001748 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001749 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Sheri Zhang1a378102020-04-30 12:59:39 +01001750 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001751 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1752 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1753 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1754 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1755 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1756 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001757 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
1758 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001759 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001760 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001761 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
1762 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
1763 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1764 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1765 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1766 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1767 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1768 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Sheri Zhang1a378102020-04-30 12:59:39 +01001769 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001770 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01001771 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001772 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1773 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1774 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001775 */
1776__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
1777 IMAGE_DECLARATION(rhs),
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001778#if defined(BETA)
1779 IMAGE_DECLARATION(bias),
1780#endif // defined(BETA)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001781 IMAGE_DECLARATION(dst),
1782 uint lhs_stride_z,
1783 uint rhs_stride_z,
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001784#if defined(BETA)
1785 uint bias_stride_z,
1786#endif //defined(BETA)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001787 uint dst_stride_z
1788#if defined(REINTERPRET_INPUT_AS_3D)
1789 ,
1790 uint lhs_cross_plane_pad
1791#endif // REINTERPRET_INPUT_AS_3D
1792#if defined(REINTERPRET_OUTPUT_AS_3D)
1793 ,
1794 uint dst_cross_plane_pad
1795#endif // REINTERPRET_OUTPUT_AS_3D
1796 )
1797{
1798 // Block size
1799#define RHS_BLOCK_SIZE ((K0) * (N0))
1800
1801 // RHS offset and step X
1802#if defined(RHS_INTERLEAVE)
1803#define RHS_OFFSET_X (N0)
1804#define RHS_STEP_X ((N0) * (H0))
1805#define RHS_STEP_LOOP (1)
1806#else // defined(RHS_INTERLEAVE)
1807#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1808#define RHS_STEP_X (N0)
1809#define RHS_STEP_LOOP (H0)
1810#endif // defined(RHS_INTERLEAVE)
1811
1812 uint x = get_global_id(0);
1813 uint y = get_global_id(1);
1814 uint z = get_global_id(2);
1815
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001816#if defined(DUMMY_WORK_ITEMS)
1817 if((x * N0 >= N) || (y * M0 >= M))
1818 {
1819 return;
1820 }
1821#endif // defined(DUMMY_WORK_ITEMS)
1822
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001823 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01001824 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001825
Sheri Zhang1a378102020-04-30 12:59:39 +01001826 // Compute RHS reshaped matrix address
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001827 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
1828
1829#if defined(MATRIX_B_DEPTH)
1830 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1831 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1832#else // defined(MATRIX_B_DEPTH)
1833 rhs_offset += z * rhs_stride_z;
1834#endif // defined(MATRIX_B_DEPTH)
1835
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001836 REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
1837 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001838
1839#if defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001840
1841 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01001842 CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001843
1844 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1845 // multiply lhs_stride_z by DEPTH_GEMM3D
1846 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1847
1848#else // defined(REINTERPRET_INPUT_AS_3D)
1849
1850 // Add offset for batched GEMM
1851 lhs_offset += z * lhs_stride_z;
1852
1853#endif // defined(REINTERPRET_INPUT_AS_3D)
1854
1855 // Initialize the accumulators
1856 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
1857
1858 int i = 0;
1859 for(; i <= (K - K0); i += K0)
1860 {
1861 // Supported cases (M0, K0):
1862 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
1863 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
1864 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
1865 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
1866 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
1867 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
1868 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
1869 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
1870 // Load values from LHS matrix
Usama Arif0681e3b2019-04-25 14:28:07 +01001871 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001872
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001873 VEC_DATA_TYPE(DATA_TYPE, N0)
1874 b0;
1875
1876 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
1877 VFMA_M0xN0(0, a, b0, c);
1878 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
1879 VFMA_M0xN0(1, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001880#if K0 > 2
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001881 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
1882 VFMA_M0xN0(2, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001883#endif // K0 > 2
1884#if K0 > 3
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001885 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
1886 VFMA_M0xN0(3, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001887#endif // K0 > 3
1888#if K0 > 4
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001889 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
1890 VFMA_M0xN0(4, a, b0, c);
1891 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
1892 VFMA_M0xN0(5, a, b0, c);
1893 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
1894 VFMA_M0xN0(6, a, b0, c);
1895 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
1896 VFMA_M0xN0(7, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001897#endif // K0 > 4
1898#if K0 > 8
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001899 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
1900 VFMA_M0xN0(8, a, b0, c);
1901 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
1902 VFMA_M0xN0(9, a, b0, c);
1903 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
1904 VFMA_M0xN0(A, a, b0, c);
1905 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
1906 VFMA_M0xN0(B, a, b0, c);
1907 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
1908 VFMA_M0xN0(C, a, b0, c);
1909 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
1910 VFMA_M0xN0(D, a, b0, c);
1911 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
1912 VFMA_M0xN0(E, a, b0, c);
1913 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
1914 VFMA_M0xN0(F, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001915#endif // K0 > 8
1916
1917 lhs_offset += K0 * sizeof(DATA_TYPE);
1918 rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
1919 }
1920
1921 // Left-over accumulations
1922 for(; i < K; ++i)
1923 {
1924 // Load values from LHS matrix
1925 VEC_DATA_TYPE(DATA_TYPE, 2)
1926 a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
1927#if M0 > 1
1928 VEC_DATA_TYPE(DATA_TYPE, 2)
1929 a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
1930#endif // M0 > 1
1931#if M0 > 2
1932 VEC_DATA_TYPE(DATA_TYPE, 2)
1933 a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
1934#endif // M0 > 2
1935#if M0 > 3
1936 VEC_DATA_TYPE(DATA_TYPE, 2)
1937 a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
1938#endif // M0 > 3
1939#if M0 > 4
1940 VEC_DATA_TYPE(DATA_TYPE, 2)
1941 a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
1942#endif // M0 > 4
1943#if M0 > 5
1944 VEC_DATA_TYPE(DATA_TYPE, 2)
1945 a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
1946#endif // M0 > 5
1947#if M0 > 6
1948 VEC_DATA_TYPE(DATA_TYPE, 2)
1949 a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
1950#endif // M0 > 6
1951#if M0 > 7
1952 VEC_DATA_TYPE(DATA_TYPE, 2)
giuros01b3204e72019-04-01 13:50:22 +01001953 a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001954#endif // M0 > 7
1955
Gian Marco Iodice781cba72020-06-19 16:56:57 +01001956 VEC_DATA_TYPE(DATA_TYPE, N0)
1957 b0;
1958
1959 b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
1960 VFMA_M0xN0(0, a, b0, c);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001961
1962 lhs_offset += sizeof(DATA_TYPE);
1963 rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
1964 }
1965
SiCong Li406a13f2020-07-15 12:09:58 +01001966 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001967
1968 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1969
1970#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001971 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01001972 CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001973
1974 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1975 // multiply dst_stride_z by DEPTH_GEMM3D
1976 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1977
1978#else // defined(REINTERPRET_OUTPUT_AS_3D)
1979
1980 // Add offset for batched GEMM
1981 dst_addr += z * dst_stride_z;
1982
1983#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1984
1985 // Multiply by the weight of matrix-matrix product and store the result
1986#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01001987 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00001988#endif // defined(ALPHA)
1989
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01001990 // Add beta*bias
1991#if defined(BETA)
1992#if defined(BROADCAST_BIAS)
1993 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
1994
1995 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
1996
1997#ifndef UNIT_BETA
1998 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
1999#endif // UNIT_BIAS
2000
2001 // c = c + bias[broadcasted]
2002 ADD_BLOCK_BROADCAST(M0, c, bias0);
2003
2004#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01002005 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Georgios Pinitasb0f342e2019-05-21 13:32:43 +01002006
2007 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2008
2009#ifndef UNIT_BETA
2010 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
2011#endif // UNIT_BIAS
2012
2013 // c = c + bias
2014 ADD_BLOCK(M0, c, bias);
2015
2016#endif // defined(BROADCAST_BIAS)
2017#endif // defined(BETA)
2018
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002019#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01002020 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002021#endif // defined(ACTIVATION_TYPE)
2022
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002023 const bool cond_y = y == 0;
2024 const bool cond_x = ((x + 1) * N0 >= N);
2025
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00002026 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01002027 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodiceba5e0962019-03-11 12:17:44 +00002028
2029#undef RHS_BLOCK_SIZE
2030#undef RHS_OFFSET_X
2031#undef RHS_STEP_X
2032}
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002033
#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                   __read_only image2d_t rhs_img,
#if defined(BETA)
                                                   IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                   IMAGE_DECLARATION(dst),
                                                   uint lhs_stride_z,
                                                   uint rhs_stride_z,
#if defined(BETA)
                                                   uint bias_stride_z,
#endif //defined(BETA)
                                                   uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                   ,
                                                   uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                   ,
                                                   uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                  )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Block size
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS offset and step X (expressed in image pixels, not in elements)
    // RHS_STEP_LOOP is defined here because the main loop below uses it to advance x_rhs;
    // the definitions are token-identical to the ones used by the buffer-based variant of
    // this kernel, so a definition that is still active from there is redefined benignly.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the N x M output do nothing
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = z;
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    // The batch index is folded into the image row: each batch occupies RHS_HEIGHT rows
    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   //uint zin0=0,zin1=0,zin2=0,... zin7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero15=0;

#if defined(REINTERPRET_INPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

    // Main loop: consume K0 elements of K per iteration. `i` is signed so that the bound
    // (K - K0) can be negative when K < K0, in which case the loop body never runs.
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        // One N0-wide RHS row is read from the image per k and accumulated into every row of c.
        // The first argument of VFMA_M0xN0 is the hexadecimal index of k inside the K0 block.
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(0, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(4, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(5, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(6, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(8, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(9, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(A, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(B, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(C, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(D, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(E, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Left-over accumulations
    for(; i < K; ++i)
    {
        // Load values from LHS matrix
        // A single scalar is read per row and assigned to a 2-wide vector (the scalar is
        // replicated to both components) - presumably so that VFMA_M0xN0(0, ...) can read
        // component 0 of a0..a(M0-1); confirm against gemm_helpers.h.
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        x_rhs += RHS_STEP_X;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every row of c
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // NOTE(review): the accumulators c0..c(M0-1) are N0 wide, yet VEC_SIZE (not N0) is passed
    // here and VEC_SIZE is not among the build options documented for this kernel - confirm
    // that the host always defines VEC_SIZE equal to N0.
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Flags handed to the boundary-aware store: cond_y is set for the first block along y,
    // cond_x for the block that reaches or crosses the right edge (N) of the output.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // PIXEL_UNIT is specific to this kernel (it is derived from N0), so it is released together
    // with the other kernel-local macros.
    // NOTE(review): RHS_STEP_LOOP is left defined so that the macro state seen by the code that
    // follows is unchanged; check later uses before adding an #undef for it.
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT)
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002343#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00002344
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002345#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002346
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002347#if defined(MIXED_PRECISION)
2348#if K0 == 2
2349#define ARM_DOT_K0(a, b, c) \
2350 ({ \
2351 c += a.s0 * b.s0; \
2352 c += a.s1 * b.s1; \
2353 })
2354#elif K0 == 3 // K0 == 3
2355#define ARM_DOT_K0(a, b, c) \
2356 ({ \
2357 c += a.s0 * b.s0; \
2358 c += a.s1 * b.s1; \
2359 c += a.s2 * b.s2; \
2360 })
2361#elif K0 == 4 // K0 == 4
2362#define ARM_DOT_K0(a, b, c) \
2363 ({ \
2364 c += a.s0 * b.s0; \
2365 c += a.s1 * b.s1; \
2366 c += a.s2 * b.s2; \
2367 c += a.s3 * b.s3; \
2368 })
2369#elif K0 == 8 // K0 == 8
2370#define ARM_DOT_K0(a, b, c) \
2371 ({ \
2372 c += a.s0 * b.s0; \
2373 c += a.s1 * b.s1; \
2374 c += a.s2 * b.s2; \
2375 c += a.s3 * b.s3; \
2376 c += a.s4 * b.s4; \
2377 c += a.s5 * b.s5; \
2378 c += a.s6 * b.s6; \
2379 c += a.s7 * b.s7; \
2380 })
2381#elif K0 == 16 // K0 == 16
2382#define ARM_DOT_K0(a, b, c) \
2383 ({ \
2384 c += a.s0 * b.s0; \
2385 c += a.s1 * b.s1; \
2386 c += a.s2 * b.s2; \
2387 c += a.s3 * b.s3; \
2388 c += a.s4 * b.s4; \
2389 c += a.s5 * b.s5; \
2390 c += a.s6 * b.s6; \
2391 c += a.s7 * b.s7; \
2392 c += a.s8 * b.s8; \
2393 c += a.s9 * b.s9; \
2394 c += a.sA * b.sA; \
2395 c += a.sB * b.sB; \
2396 c += a.sC * b.sC; \
2397 c += a.sD * b.sD; \
2398 c += a.sE * b.sE; \
2399 c += a.sF * b.sF; \
2400 })
2401#else // K0 not supported
2402#error "K0 value not supported"
2403#endif // K0 conditions
2404#else // defined(MIXED_PRECISION)
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002405#if K0 == 2
2406#define ARM_DOT_K0(a, b, c) \
2407 ({ \
2408 c = fma(a.s0, b.s0, c); \
2409 c = fma(a.s1, b.s1, c); \
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002410 })
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002411#elif K0 == 3 // K0 == 3
2412#define ARM_DOT_K0(a, b, c) \
2413 ({ \
2414 c = fma(a.s0, b.s0, c); \
2415 c = fma(a.s1, b.s1, c); \
2416 c = fma(a.s2, b.s2, c); \
2417 })
2418#elif K0 == 4 // K0 == 4
2419#define ARM_DOT_K0(a, b, c) \
2420 ({ \
2421 c = fma(a.s0, b.s0, c); \
2422 c = fma(a.s1, b.s1, c); \
2423 c = fma(a.s2, b.s2, c); \
2424 c = fma(a.s3, b.s3, c); \
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002425 })
2426#elif K0 == 8 // K0 == 8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002427#define ARM_DOT_K0(a, b, c) \
2428 ({ \
2429 c = fma(a.s0, b.s0, c); \
2430 c = fma(a.s1, b.s1, c); \
2431 c = fma(a.s2, b.s2, c); \
2432 c = fma(a.s3, b.s3, c); \
2433 c = fma(a.s4, b.s4, c); \
2434 c = fma(a.s5, b.s5, c); \
2435 c = fma(a.s6, b.s6, c); \
2436 c = fma(a.s7, b.s7, c); \
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002437 })
2438#elif K0 == 16 // K0 == 16
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002439#define ARM_DOT_K0(a, b, c) \
2440 ({ \
2441 c = fma(a.s0, b.s0, c); \
2442 c = fma(a.s1, b.s1, c); \
2443 c = fma(a.s2, b.s2, c); \
2444 c = fma(a.s3, b.s3, c); \
2445 c = fma(a.s4, b.s4, c); \
2446 c = fma(a.s5, b.s5, c); \
2447 c = fma(a.s6, b.s6, c); \
2448 c = fma(a.s7, b.s7, c); \
2449 c = fma(a.s8, b.s8, c); \
2450 c = fma(a.s9, b.s9, c); \
2451 c = fma(a.sA, b.sA, c); \
2452 c = fma(a.sB, b.sB, c); \
2453 c = fma(a.sC, b.sC, c); \
2454 c = fma(a.sD, b.sD, c); \
2455 c = fma(a.sE, b.sE, c); \
2456 c = fma(a.sF, b.sF, c); \
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002457 })
2458#else // K0 not supported
2459#error "K0 value not supported"
2460#endif // K0 conditions
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002461#endif // defined(MIXED_PRECISION)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002462
/** Accumulate N0 dot products of one LHS row against N0 RHS rows.
 *
 * For every j in [0, N0) this expands to ARM_DOT_K0(a, b<j>, c.s<j>), i.e. the dot product
 * of @p a with the variable named b<j> is accumulated into component j of @p c.
 *
 * @note @p b is a name prefix, not a value: the hexadecimal index (0..F) is token-pasted onto it
 *       (b##0, b##1, ... b##F), so variables b0 .. b<N0-1> must be in scope at the call site.
 * @note Only N0 = 2, 3, 4, 8, 16 are handled; any other value stops compilation with #error.
 *
 * @param[in]      a Row operand forwarded unchanged to every ARM_DOT_K0 call
 * @param[in]      b Base name of the N0 operands b0 .. b<N0-1>
 * @param[in, out] c Accumulator vector; components s0 .. s<N0-1> are updated in place
 */
2463#if N0 == 2
2464#define ARM_DOT_K0XN0(a, b, c) \
2465 ({ \
2466 ARM_DOT_K0((a), (b##0), (c.s0)); \
2467 ARM_DOT_K0((a), (b##1), (c.s1)); \
2468 })
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002469#elif N0 == 3 // N0 == 3
2470#define ARM_DOT_K0XN0(a, b, c) \
2471 ({ \
2472 ARM_DOT_K0((a), (b##0), (c.s0)); \
2473 ARM_DOT_K0((a), (b##1), (c.s1)); \
2474 ARM_DOT_K0((a), (b##2), (c.s2)); \
2475 })
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002476#elif N0 == 4 // N0 == 4
2477#define ARM_DOT_K0XN0(a, b, c) \
2478 ({ \
2479 ARM_DOT_K0((a), (b##0), (c.s0)); \
2480 ARM_DOT_K0((a), (b##1), (c.s1)); \
2481 ARM_DOT_K0((a), (b##2), (c.s2)); \
2482 ARM_DOT_K0((a), (b##3), (c.s3)); \
2483 })
2484#elif N0 == 8 // N0 == 8
2485#define ARM_DOT_K0XN0(a, b, c) \
2486 ({ \
2487 ARM_DOT_K0((a), (b##0), (c.s0)); \
2488 ARM_DOT_K0((a), (b##1), (c.s1)); \
2489 ARM_DOT_K0((a), (b##2), (c.s2)); \
2490 ARM_DOT_K0((a), (b##3), (c.s3)); \
2491 ARM_DOT_K0((a), (b##4), (c.s4)); \
2492 ARM_DOT_K0((a), (b##5), (c.s5)); \
2493 ARM_DOT_K0((a), (b##6), (c.s6)); \
2494 ARM_DOT_K0((a), (b##7), (c.s7)); \
2495 })
2496#elif N0 == 16 // N0 == 16
2497#define ARM_DOT_K0XN0(a, b, c) \
2498 ({ \
2499 ARM_DOT_K0((a), (b##0), (c.s0)); \
2500 ARM_DOT_K0((a), (b##1), (c.s1)); \
2501 ARM_DOT_K0((a), (b##2), (c.s2)); \
2502 ARM_DOT_K0((a), (b##3), (c.s3)); \
2503 ARM_DOT_K0((a), (b##4), (c.s4)); \
2504 ARM_DOT_K0((a), (b##5), (c.s5)); \
2505 ARM_DOT_K0((a), (b##6), (c.s6)); \
2506 ARM_DOT_K0((a), (b##7), (c.s7)); \
2507 ARM_DOT_K0((a), (b##8), (c.s8)); \
2508 ARM_DOT_K0((a), (b##9), (c.s9)); \
2509 ARM_DOT_K0((a), (b##A), (c.sA)); \
2510 ARM_DOT_K0((a), (b##B), (c.sB)); \
2511 ARM_DOT_K0((a), (b##C), (c.sC)); \
2512 ARM_DOT_K0((a), (b##D), (c.sD)); \
2513 ARM_DOT_K0((a), (b##E), (c.sE)); \
2514 ARM_DOT_K0((a), (b##F), (c.sF)); \
2515 })
2516#else // N0 not supported
2517#error "N0 value not supported"
2518#endif // N0 conditions
2519
2520/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
2521 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
2522 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
2523 *
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002524 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
2525 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
2526 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002527 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002528 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). Note that the accumulation loop of this kernel is bounded by the run-time argument @p k.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002529 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
2530 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
2531 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002532 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
2533 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002534 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
2535 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002536 * @note Only the following configurations of M0, N0 and K0 are currently supported:
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01002537 * - M0 = 2, 3, 4, 5, 6, 7, 8
Gian Marco Iodicebacfec52019-01-11 11:30:55 +00002538 * - N0 = 2, 3, 4, 8, 16
2539 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +00002540 * - V0 >= 1
2541 * - H0 >= 1
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002542 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002543 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002544 * The activation function is performed after the bias addition
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01002545 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002546 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
2547 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
2548 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
2549 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
2550 *
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002551 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
2552 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
2553 * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per work-item (in bytes)
2554 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
2555 * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per work-item (in bytes)
2556 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
2557 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
2558 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
2559 * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per work-item (in bytes)
2560 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
2561 * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per work-item (in bytes)
2562 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
2563 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
2564 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
2565 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
2566 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
2567 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
2568 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
2569 * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
2570 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
2571 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2572 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
2573 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2574 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002575 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002576 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
2577 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
2578 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
2579 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2580 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002581 */
2582__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
2583 IMAGE_DECLARATION(rhs),
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002584#if defined(BETA)
2585 IMAGE_DECLARATION(bias),
2586#endif // defined(BETA)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002587 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002588 uint k,
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002589 uint lhs_stride_z,
2590 uint rhs_stride_z,
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002591#if defined(BETA)
2592 uint bias_stride_z,
2593#endif // defined(BETA)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002594 uint dst_stride_z
2595#if defined(REINTERPRET_OUTPUT_AS_3D)
2596 ,
2597 uint dst_cross_plane_pad
2598#endif // REINTERPRET_OUTPUT_AS_3D
2599 )
2600{
2601 // LHS block size: number of elements in one M0xK0 block
2602#define LHS_BLOCK_SIZE ((K0) * (M0))
2603
    // LHS offset and step X.
    // Interleaved layout: consecutive blocks are K0 elements apart and a row of a block is (K0 * V0) elements long.
    // Non-interleaved layout: consecutive blocks are a whole block apart and V0 blocks are skipped per loop iteration.
2604#if defined(LHS_INTERLEAVE)
2605#define LHS_OFFSET_X (K0)
2606#define LHS_STEP_X ((K0) * (V0))
2607#define LHS_STEP_LOOP (1)
2608#else // defined(LHS_INTERLEAVE)
2609#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
2610#define LHS_STEP_X (K0)
2611#define LHS_STEP_LOOP (V0)
2612#endif // defined(LHS_INTERLEAVE)
2613
2614 // RHS block size: number of elements in one K0xN0 block
2615#define RHS_BLOCK_SIZE ((K0) * (N0))
2616
2617 // RHS offset and step X (same scheme as the LHS, with H0 in place of V0)
2618#if defined(RHS_INTERLEAVE)
2619#define RHS_OFFSET_X (K0)
2620#define RHS_STEP_X ((K0) * (H0))
2621#define RHS_STEP_LOOP (1)
2622#else // defined(RHS_INTERLEAVE)
2623#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
2624#define RHS_STEP_X (K0)
2625#define RHS_STEP_LOOP (H0)
2626#endif // defined(RHS_INTERLEAVE)
2627
    // Work-items whose block starts at or beyond column N or row M have nothing to compute: leave immediately
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00002628#if defined(DUMMY_WORK_ITEMS)
2629 if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
2630 {
2631 return;
2632 }
2633#endif // defined(DUMMY_WORK_ITEMS)
2634
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002635 // Compute LHS matrix address:
    // (get_global_id(1) % V0) selects the block inside a reshaped row, (get_global_id(1) / V0) selects the reshaped row,
    // get_global_id(2) selects the batch
2636 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
2637 (get_global_id(2) * lhs_stride_z);
2638
2639 // Compute RHS matrix address (same decomposition as the LHS, using get_global_id(0) and H0; the batch offset is added below)
2640 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
2641
2642#if defined(MATRIX_B_DEPTH)
2643 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
2644 rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
2645#else // defined(MATRIX_B_DEPTH)
2646 rhs_addr += get_global_id(2) * rhs_stride_z;
2647#endif // defined(MATRIX_B_DEPTH)
2648
2649 // Initialize the accumulators (M0 vectors of N0 DATA_TYPE_ACCUMULATOR elements, referenced below as c0, c1, ...)
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002650 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002651
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002652 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); // uint zlhs0=0, zlhs1=0, zlhs2=0, ... zlhs7=0;
    // 16 zero-valued offsets, passed as the last LOAD_BLOCK argument for the RHS and bias loads
2653 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
Usama Arif0681e3b2019-04-25 14:28:07 +01002654
    // Walk the K dimension K0 elements per iteration.
    // NOTE(review): i is int while k is uint, so the comparison is carried out in unsigned arithmetic.
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002655 for(int i = 0; i < k; i += K0)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002656 {
2657 // Supported cases (M0, K0):
Gian Marco Iodiceadc53952019-02-15 11:10:31 +00002658 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
2659 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
2660 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
2661 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
2662 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
2663 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
2664 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
2665 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002666 // Load values from LHS matrix (M0 rows of K0 elements, referenced below as a0, a1, ...)
Usama Arif0681e3b2019-04-25 14:28:07 +01002667 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002668
2669 // Load values from RHS matrix (N0 rows of K0 elements; ARM_DOT_K0XN0 pastes the row index onto the base name b)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002670 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002671
2672 // Accumulate: one ARM_DOT_K0XN0 per LHS row, guarded by M0 so that only existing rows are referenced
2673 ARM_DOT_K0XN0(a0, b, c0);
2674#if M0 > 1
2675 ARM_DOT_K0XN0(a1, b, c1);
2676#endif // M0 > 1
2677#if M0 > 2
2678 ARM_DOT_K0XN0(a2, b, c2);
2679#endif // M0 > 2
2680#if M0 > 3
2681 ARM_DOT_K0XN0(a3, b, c3);
2682#endif // M0 > 3
2683#if M0 > 4
2684 ARM_DOT_K0XN0(a4, b, c4);
2685#endif // M0 > 4
2686#if M0 > 5
2687 ARM_DOT_K0XN0(a5, b, c5);
2688#endif // M0 > 5
2689#if M0 > 6
2690 ARM_DOT_K0XN0(a6, b, c6);
2691#endif // M0 > 6
2692#if M0 > 7
2693 ARM_DOT_K0XN0(a7, b, c7);
2694#endif // M0 > 7
2695
        // Advance both pointers to the blocks holding the next K0 elements along K
2696 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
2697 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
2698 }
2699
    // Destination address of the M0xN0 output block owned by this work-item (the batch offset is added below)
2700 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
2701
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002702 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002703
2704#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002705
2706 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Michele Di Giorgio5fa963f2020-11-23 15:05:12 +00002707 CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002708 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
2709 // multiply dst_stride_z by DEPTH_GEMM3D
2710 dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
2711
2712#else // defined(REINTERPRET_OUTPUT_AS_3D)
2713
2714 // Add offset for batched GEMM
2715 dst_addr += get_global_id(2) * dst_stride_z;
2716
2717#endif // defined(REINTERPRET_OUTPUT_AS_3D)
2718
2719 // Multiply the accumulators by ALPHA, the weight of the matrix-matrix product
    // NOTE(review): c holds DATA_TYPE_ACCUMULATOR values but SCALE_BLOCK is given DATA_TYPE - confirm this is intended when MIXED_PRECISION is defined
2720#if defined(ALPHA)
Usama Arif0681e3b2019-04-25 14:28:07 +01002721 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002722#endif // defined(ALPHA)
2723
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002724 // Add beta*bias
2725#if defined(BETA)
2726#if defined(BROADCAST_BIAS)
    // Broadcast case: a single bias row (no y or z offset) is loaded and added to every row of the block
2727 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
2728
2729 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2730
2731#ifndef UNIT_BETA
2732 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
2733#endif // UNIT_BETA
2734
2735 // c = c + bias[broadcasted]
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002736#if defined(MIXED_PRECISION)
    // Convert the bias to the accumulator type before adding it
2737 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
2738 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
2739#else // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002740 ADD_BLOCK_BROADCAST(M0, c, bias0);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002741#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002742
2743#else // defined(BROADCAST_BIAS)
    // Full bias case: the bias block is addressed like the destination block (x, y and batch offsets)
2744 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
2745 2) * bias_stride_z;
2746
2747 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
2748
2749#ifndef UNIT_BETA
2750 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
2751#endif // UNIT_BETA
2752
2753 // c = c + bias
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002754#if defined(MIXED_PRECISION)
    // Convert the bias to the accumulator type before adding it
2755 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
2756 ADD_BLOCK(M0, c, bias_hp);
2757#else // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002758 ADD_BLOCK(M0, c, bias);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002759#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002760
2761#endif // defined(BROADCAST_BIAS)
2762#endif // defined(BETA)
2763
    // Apply the activation on the accumulators, after the bias addition
    // NOTE(review): VEC_SIZE is not defined inside this kernel while the accumulators are N0 wide - confirm it is supplied by the build options and matches N0
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002764#if defined(ACTIVATION_TYPE)
Georgios Pinitasa07ce152019-10-11 17:38:50 +01002765#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01002766 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01002767#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01002768 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01002769#endif // defined(MIXED_PRECISION)
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01002770#endif // defined(ACTIVATION_TYPE)
2771
    // True when the block of this work-item reaches or passes the last row (cond_y) / last column (cond_x) of the output.
    // Both flags are handed to STORE_BLOCK_BOUNDARY_AWARE together with PARTIAL_STORE_M0 / PARTIAL_STORE_N0.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002772 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
2773 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
2774
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002775 // Store output block (with MIXED_PRECISION the accumulators are first converted back to DATA_TYPE)
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002776#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002777 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01002778 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002779#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01002780 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01002781#endif // defined(MIXED_PRECISION)
Gian Marco Iodicee16c8902019-06-14 16:11:10 +01002782
    // Release the helper macros defined at the top of this kernel so that other kernels in the file can redefine them
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002783#undef LHS_BLOCK_SIZE
2784#undef LHS_OFFSET_X
2785#undef LHS_STEP_X
2786#undef RHS_BLOCK_SIZE
2787#undef RHS_OFFSET_X
2788#undef RHS_STEP_X
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002789#undef LHS_STEP_LOOP
2790#undef RHS_STEP_LOOP
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00002791}
giuros01b3204e72019-04-01 13:50:22 +01002792
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002793#if defined(OPENCL_IMAGE_SUPPORT)
2794/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
2795 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
2796 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
2797 *
2798 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
2799 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
2800 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
2801 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
2802 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
2803 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Gian Marco Iodice781cba72020-06-19 16:56:57 +01002804 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
2805 * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
2806 * could be different from the value returned by get_image_height(rhs_img).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002807 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
2808 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
2809 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
2810 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
2811 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01002812 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
2813 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002814 * @note Only the following configurations of M0, N0 and K0 are currently supported:
2815 * - M0 = 2, 3, 4, 5, 6, 7, 8
2816 * - N0 = 4, 8, 16
2817 * - K0 = 4, 8, 16
2818 * - V0 >= 1
2819 * - H0 >= 1
2820 *
2821 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
2822 * The activation function is performed after the bias addition
2823 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
2824 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
2825 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
2826 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
2827 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
2828 *
2829 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
2830 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
2831 * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per work-item (in bytes)
2832 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
2833 * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per work-item (in bytes)
2834 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
2835 * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
2836 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
2837 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
2838 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
2839 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
2840 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
2841 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
2842 * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
2843 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
2844 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2845 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
2846 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
2847 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002848 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002849 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
2850 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
2851 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
2852 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
2853 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
2854 */
2855__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
2856 __read_only image2d_t rhs_img,
2857#if defined(BETA)
2858 IMAGE_DECLARATION(bias),
2859#endif // defined(BETA)
2860 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01002861 uint k,
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002862 uint lhs_stride_z,
2863 uint rhs_stride_z,
2864#if defined(BETA)
2865 uint bias_stride_z,
2866#endif // defined(BETA)
2867 uint dst_stride_z
2868#if defined(REINTERPRET_OUTPUT_AS_3D)
2869 ,
2870 uint dst_cross_plane_pad
2871#endif // REINTERPRET_OUTPUT_AS_3D
2872 )
2873{
2874 // Pixel unit: the K0 vector size converted with CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT; all RHS offsets and steps below are expressed in this unit
2875#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
2876
2877 // LHS block size: number of elements in one M0xK0 block
2878#define LHS_BLOCK_SIZE ((K0) * (M0))
2879
    // LHS offset and step X.
    // Interleaved layout: consecutive blocks are K0 elements apart and a row of a block is (K0 * V0) elements long.
    // Non-interleaved layout: consecutive blocks are a whole block apart and V0 blocks are skipped per loop iteration.
2880#if defined(LHS_INTERLEAVE)
2881#define LHS_OFFSET_X (K0)
2882#define LHS_STEP_X ((K0) * (V0))
2883#define LHS_STEP_LOOP (1)
2884#else // defined(LHS_INTERLEAVE)
2885#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
2886#define LHS_STEP_X (K0)
2887#define LHS_STEP_LOOP (V0)
2888#endif // defined(LHS_INTERLEAVE)
2889
2890 // RHS block size, in pixel units
2891#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
2892
2893 // RHS offset and step X (in pixel units, same scheme as the LHS with H0 in place of V0)
2894#if defined(RHS_INTERLEAVE)
2895#define RHS_OFFSET_X (PIXEL_UNIT)
2896#define RHS_STEP_X (PIXEL_UNIT * (H0))
2897#define RHS_STEP_LOOP (1)
2898#else // defined(RHS_INTERLEAVE)
2899#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
2900#define RHS_STEP_X PIXEL_UNIT
2901#define RHS_STEP_LOOP (H0)
2902#endif // defined(RHS_INTERLEAVE)
2903
    // Work-items whose block starts at or beyond column N or row M have nothing to compute: leave immediately
2904#if defined(DUMMY_WORK_ITEMS)
2905 if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
2906 {
2907 return;
2908 }
2909#endif // defined(DUMMY_WORK_ITEMS)
2910
2911 // Compute LHS matrix address:
    // (get_global_id(1) % V0) selects the block inside a reshaped row, (get_global_id(1) / V0) selects the reshaped row,
    // get_global_id(2) selects the batch
2912 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
2913 (get_global_id(2) * lhs_stride_z);
2914
2915#if defined(MATRIX_B_DEPTH)
2916 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
2917 const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
2918#else // defined(MATRIX_B_DEPTH)
2919 const uint z_rhs = get_global_id(2);
2920#endif // defined(MATRIX_B_DEPTH)
2921
2922 // Compute RHS matrix coordinates inside the image.
    // The batch index is folded into the row coordinate through RHS_HEIGHT; rhs_stride_z is not read in this kernel body.
2923 uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
2924 const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
2925
2926 // Initialize the accumulators (M0 vectors of N0 DATA_TYPE_ACCUMULATOR elements, referenced below as c0, c1, ...)
2927 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
2928
2929 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); // uint zlhs0=0, zlhs1=0, zlhs2=0, ... zlhs7=0;
    // 16 zero-valued offsets, passed as the last LOAD_BLOCK argument for the bias loads
2930 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
2931
    // Walk the K dimension K0 elements per iteration.
    // NOTE(review): the bound is the compile-time macro K; the run-time argument k is not read in this kernel body,
    // whereas gemm_mm_reshaped_lhs_nt_rhs_t bounds the same loop with k - confirm that -DK is always provided and equals k.
2932 for(int i = 0; i < K; i += K0)
2933 {
2934 // Load values from LHS matrix (M0 rows of K0 elements, referenced below as a0, a1, ...)
2935 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
2936
2937 // Load values from RHS matrix stored in a cl_image (the N0 destination vectors are declared and zero-initialised first)
2938 REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
2939 LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
2940
2941 // Accumulate: one ARM_DOT_K0XN0 per LHS row, guarded by M0 so that only existing rows are referenced
2942 ARM_DOT_K0XN0(a0, b, c0);
2943#if M0 > 1
2944 ARM_DOT_K0XN0(a1, b, c1);
2945#endif // M0 > 1
2946#if M0 > 2
2947 ARM_DOT_K0XN0(a2, b, c2);
2948#endif // M0 > 2
2949#if M0 > 3
2950 ARM_DOT_K0XN0(a3, b, c3);
2951#endif // M0 > 3
2952#if M0 > 4
2953 ARM_DOT_K0XN0(a4, b, c4);
2954#endif // M0 > 4
2955#if M0 > 5
2956 ARM_DOT_K0XN0(a5, b, c5);
2957#endif // M0 > 5
2958#if M0 > 6
2959 ARM_DOT_K0XN0(a6, b, c6);
2960#endif // M0 > 6
2961#if M0 > 7
2962 ARM_DOT_K0XN0(a7, b, c7);
2963#endif // M0 > 7
2964
        // Advance the LHS pointer (in bytes) and the RHS image x coordinate (in pixel units) to the next K0 elements along K
2965 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
2966
2967 x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
2968 }
2969
    // Destination address of the M0xN0 output block owned by this work-item (the batch offset is added below)
2970 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
2971
2972 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
2973
2974#if defined(REINTERPRET_OUTPUT_AS_3D)
2975
2976 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Michele Di Giorgio5fa963f2020-11-23 15:05:12 +00002977 CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01002978 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
2979 // multiply dst_stride_z by DEPTH_GEMM3D
2980 dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
2981
2982#else // defined(REINTERPRET_OUTPUT_AS_3D)
2983
2984 // Add offset for batched GEMM
2985 dst_addr += get_global_id(2) * dst_stride_z;
2986
2987#endif // defined(REINTERPRET_OUTPUT_AS_3D)
2988
2989 // Multiply the accumulators by ALPHA, the weight of the matrix-matrix product
    // NOTE(review): c holds DATA_TYPE_ACCUMULATOR values but SCALE_BLOCK is given DATA_TYPE - confirm this is intended when MIXED_PRECISION is defined
2990#if defined(ALPHA)
2991 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
2992#endif // defined(ALPHA)
2993
2994 // Add beta*bias
2995#if defined(BETA)
2996#if defined(BROADCAST_BIAS)
    // Broadcast case: a single bias row (no y or z offset) is loaded and added to every row of the block
2997 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
2998
2999 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3000
3001#ifndef UNIT_BETA
3002 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3003#endif // UNIT_BETA
3004
3005 // c = c + bias[broadcasted]
3006#if defined(MIXED_PRECISION)
    // Convert the bias to the accumulator type before adding it
3007 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3008 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3009#else // defined(MIXED_PRECISION)
3010 ADD_BLOCK_BROADCAST(M0, c, bias0);
3011#endif // defined(MIXED_PRECISION)
3012
3013#else // defined(BROADCAST_BIAS)
    // Full bias case: the bias block is addressed like the destination block (x, y and batch offsets)
3014 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
3015 2) * bias_stride_z;
3016
3017 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3018
3019#ifndef UNIT_BETA
3020 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3021#endif // UNIT_BETA
3022
3023 // c = c + bias
3024#if defined(MIXED_PRECISION)
    // Convert the bias to the accumulator type before adding it
3025 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3026 ADD_BLOCK(M0, c, bias_hp);
3027#else // defined(MIXED_PRECISION)
3028 ADD_BLOCK(M0, c, bias);
3029#endif // defined(MIXED_PRECISION)
3030
3031#endif // defined(BROADCAST_BIAS)
3032#endif // defined(BETA)
3033
    // Apply the activation on the accumulators, after the bias addition
    // NOTE(review): VEC_SIZE is not defined inside this kernel while the accumulators are N0 wide - confirm it is supplied by the build options and matches N0
3034#if defined(ACTIVATION_TYPE)
3035#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003036 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003037#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003038 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003039#endif // defined(MIXED_PRECISION)
3040#endif // defined(ACTIVATION_TYPE)
3041
    // True when the block of this work-item reaches or passes the last row (cond_y) / last column (cond_x) of the output.
    // Both flags are handed to STORE_BLOCK_BOUNDARY_AWARE together with PARTIAL_STORE_M0 / PARTIAL_STORE_N0.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003042 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
3043 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
3044
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003045 // Store output block (with MIXED_PRECISION the accumulators are first converted back to DATA_TYPE)
3046#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003047 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003048 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003049#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003050 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003051#endif // defined(MIXED_PRECISION)
3052
    // Release the helper macros defined at the top of this kernel so that other kernels in the file can redefine them
3053#undef LHS_BLOCK_SIZE
3054#undef LHS_OFFSET_X
3055#undef LHS_STEP_X
3056#undef RHS_BLOCK_SIZE
3057#undef RHS_OFFSET_X
3058#undef RHS_STEP_X
3059#undef PIXEL_UNIT
3060#undef LHS_STEP_LOOP
3061#undef RHS_STEP_LOOP
3062}
3063#endif // defined(OPENCL_IMAGE_SUPPORT)
3064
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003065#if defined(LHS_TRANSPOSE)
3066
// Shorthand for the vector type made of SIZE elements of type TYPE.
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(COLS, lhs, rhs, acc): acc = acc + lhs * rhs
//   COLS : number of elements of the accumulator vector (only used when MIXED_PRECISION is defined)
//   lhs  : first multiplicand
//   rhs  : second multiplicand
//   acc  : accumulator, updated in place
// When GPU_ARCH == GPU_ARCH_MIDGARD the update is written as a multiply followed by an add,
// otherwise it is written with fma().
// When MIXED_PRECISION is defined both multiplicands are first converted to
// VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, COLS).
// NOTE: every variant already ends with ';'.
#if(GPU_ARCH == GPU_ARCH_MIDGARD)

#if defined(MIXED_PRECISION)
#define ARM_VFMA(COLS, lhs, rhs, acc) acc += (CONVERT(lhs, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, COLS))) * (CONVERT(rhs, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, COLS)));
#else // defined(MIXED_PRECISION)
#define ARM_VFMA(COLS, lhs, rhs, acc) acc += (lhs) * (rhs);
#endif // defined(MIXED_PRECISION)

#else // GPU_ARCH == GPU_ARCH_MIDGARD

#if defined(MIXED_PRECISION)
#define ARM_VFMA(COLS, lhs, rhs, acc) acc = fma((CONVERT(lhs, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, COLS))), (CONVERT(rhs, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, COLS))), (acc));
#else // defined(MIXED_PRECISION)
#define ARM_VFMA(COLS, lhs, rhs, acc) acc = fma((lhs), (rhs), (acc));
#endif // defined(MIXED_PRECISION)

#endif // GPU_ARCH == GPU_ARCH_MIDGARD
3086
// Column-vector (transposed LHS) by row-vector (not transposed RHS) accumulation, K0 = 1.
// One variant exists per supported number of rows (1, 2, 3, 4, 8). For every row R the variant issues
//   ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.sR), rhs_row, ACC##R)
// and the variants with more rows reuse the next smaller one for their leading rows.
//   COLS    : number of elements of rhs_row / of each accumulator row
//   ELEM_T  : element type the LHS scalar is converted with
//   lhs_col : column-vector taken from the transposed LHS block (used as-is, without a component
//             selector, by the 1-row variant)
//   rhs_row : row-vector taken from the RHS block
//   ACC     : basename of the accumulator rows (ACC##0, ACC##1, ...)
#define ARM_VVM_T_NT_1xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC)              \
    ({                                                                       \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col), rhs_row, (ACC##0));    \
    })
#define ARM_VVM_T_NT_2xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC)              \
    ({                                                                       \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.s0), rhs_row, (ACC##0)); \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.s1), rhs_row, (ACC##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC)              \
    ({                                                                       \
        ARM_VVM_T_NT_2xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC);             \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.s2), rhs_row, (ACC##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC)              \
    ({                                                                       \
        ARM_VVM_T_NT_3xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC);             \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.s3), rhs_row, (ACC##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC)              \
    ({                                                                       \
        ARM_VVM_T_NT_4xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC);             \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.s4), rhs_row, (ACC##4)); \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.s5), rhs_row, (ACC##5)); \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.s6), rhs_row, (ACC##6)); \
        ARM_VFMA(COLS, (VTYPE(ELEM_T, COLS))(lhs_col.s7), rhs_row, (ACC##7)); \
    })

// Factory macro selecting the column-vector by row-vector variant that matches ROWS (K0 = 1).
//   ROWS    : number of accumulator rows; pasted into the variant name, so it must be a literal
//             for which a variant exists (1, 2, 3, 4 or 8)
//   lhs_col : column-vector (transposed)
//   rhs_row : row-vector (not transposed)
//   ACC     : basename of the output matrix rows
#define ARM_VVM_T_NT_M0xN0x1(ROWS, COLS, ELEM_T, lhs_col, rhs_row, ACC) ARM_VVM_T_NT_##ROWS##xN0x1(COLS, ELEM_T, lhs_col, rhs_row, ACC)
3122
// Matrix (transposed LHS) by matrix (not transposed RHS) accumulation for a fixed K0.
// Each variant issues one ARM_VVM_T_NT_M0xN0x1 per K0 step s, pairing LHS##s with RHS##s,
// where s is the hexadecimal digit of the step (0..9, A..F).
//   M0, N0 : block dimensions forwarded to ARM_VVM_T_NT_M0xN0x1
//   TYPE   : element type forwarded to ARM_VVM_T_NT_M0xN0x1
//   LHS    : basename of the transposed LHS vectors (LHS##0, LHS##1, ...)
//   RHS    : basename of the RHS vectors (RHS##0, RHS##1, ...)
//   ACC    : basename of the accumulator rows
// The basename parameters are deliberately NOT called A, B, C: the K0 = 16 variant pastes the
// suffixes A..F, and a parameter named A/B/C would be substituted inside the suffix itself
// (A##A would yield "aa" instead of "aA").
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, LHS, RHS, ACC)             \
    ({                                                               \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##0), (RHS##0), ACC); \
    })
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, LHS, RHS, ACC)             \
    ({                                                               \
        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, LHS, RHS, ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##1), (RHS##1), ACC); \
    })
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, LHS, RHS, ACC)             \
    ({                                                               \
        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, LHS, RHS, ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##2), (RHS##2), ACC); \
    })
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, LHS, RHS, ACC)             \
    ({                                                               \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, LHS, RHS, ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##3), (RHS##3), ACC); \
    })
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, LHS, RHS, ACC)             \
    ({                                                               \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, LHS, RHS, ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##4), (RHS##4), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##5), (RHS##5), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##6), (RHS##6), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##7), (RHS##7), ACC); \
    })
// Steps 8..F must go through ARM_VVM_T_NT_M0xN0x1 directly: the operands are already complete
// (parenthesised) names, so they cannot be handed to ARM_MM_T_NT_M0xN0x1, which pastes a
// further "0" suffix onto its basename arguments.
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, LHS, RHS, ACC)            \
    ({                                                               \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, LHS, RHS, ACC);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##8), (RHS##8), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##9), (RHS##9), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##A), (RHS##A), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##B), (RHS##B), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##C), (RHS##C), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##D), (RHS##D), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##E), (RHS##E), ACC); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##F), (RHS##F), ACC); \
    })

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// The dimensions for this matrix multiplication are defined through M0, N0 and K0
// The dimensions supported are:
// M0: 1, 2, 3, 4, 8
// N0: 1, 2, 3, 4, 8, 16
// K0: 1, 2, 3, 4, 8, 16
// This macro calls the vector-by-vector macro (ARM_VVM_T_NT_M0xN0x1) K0 times
// A, B and C are the basenames of the LHS vectors, the RHS vectors and the accumulator rows
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)
3174
3175/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
3176 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3177 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3178 *
3179 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
3180 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003181 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003182 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3183 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3184 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003187 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
3188 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003189 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3190 * - M0 = 2, 3, 4, 8
3191 * - N0 = 2, 3, 4, 8, 16
3192 * - K0 = 2, 3, 4, 8, 16
3193 * - V0 >= 1
3194 * - H0 >= 1
3195 *
3196 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3197 * The activation function is performed after the bias addition
3198 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3199 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3200 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3201 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3202 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3203 *
3204 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
3205 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3206 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3207 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3208 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3209 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3210 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
3211 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
3212 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3213 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
3214 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3215 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
3216 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3217 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3218 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3219 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3220 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3221 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3222 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3223 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3224 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3225 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3226 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3227 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003228 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003229 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3230 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3231 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3232 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3233 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3234 */
3235__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
3236 IMAGE_DECLARATION(rhs),
3237#if defined(BETA)
3238 IMAGE_DECLARATION(bias),
3239#endif // defined(BETA)
3240 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003241 uint k,
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003242 uint lhs_stride_z,
3243 uint rhs_stride_z,
3244#if defined(BETA)
3245 uint bias_stride_z,
3246#endif //defined(BETA)
3247 uint dst_stride_z
3248#if defined(REINTERPRET_OUTPUT_AS_3D)
3249 ,
3250 uint dst_cross_plane_pad
3251#endif // REINTERPRET_OUTPUT_AS_3D
3252 )
3253{
3254 // Block size
3255#define LHS_BLOCK_SIZE ((K0) * (M0))
3256
3257#if defined(LHS_INTERLEAVE)
3258#define LHS_OFFSET_X (M0)
3259#define LHS_STEP_X ((M0) * (V0))
3260#define LHS_STEP_LOOP (1)
3261#else // defined(INTERLEAVE)
3262#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3263#define LHS_STEP_X (M0)
3264#define LHS_STEP_LOOP (V0)
3265#endif // defined(INTERLEAVE)
3266
3267 // Block size
3268#define RHS_BLOCK_SIZE ((K0) * (N0))
3269
3270 // RHS offset and step X
3271#if defined(RHS_INTERLEAVE)
3272#define RHS_OFFSET_X (N0)
3273#define RHS_STEP_X ((N0) * (H0))
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003274#else // defined(RHS_INTERLEAVE)
3275#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3276#define RHS_STEP_X (N0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003277#endif // defined(RHS_INTERLEAVE)
3278
3279 const uint x = get_global_id(0);
3280 const uint y = get_global_id(1);
3281 const uint z = get_global_id(2);
3282
3283#if defined(DUMMY_WORK_ITEMS)
3284 if((x * N0 >= N) || (y * M0 >= M))
3285 {
3286 return;
3287 }
3288#endif // defined(DUMMY_WORK_ITEMS)
3289
3290 // Compute LHS matrix address
3291 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3292
3293 // Compute RHS matrix address
3294 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
3295
3296#if defined(MATRIX_B_DEPTH)
3297 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3298 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
3299#else // defined(MATRIX_B_DEPTH)
3300 rhs_addr += z * rhs_stride_z;
3301#endif // defined(MATRIX_B_DEPTH)
3302
3303 // Initialize the accumulators
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003304 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003305
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003306 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3307
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003308 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3309 __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
3310
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003311 for(int i = 0; i < k; i += K0)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003312 {
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003313 VEC_DATA_TYPE(DATA_TYPE, M0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003314 a0;
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003315 VEC_DATA_TYPE(DATA_TYPE, N0)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003316 b0;
3317
3318 a0 = VLOAD(M0)(0, lhs);
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003319 b0 = VLOAD(N0)(0, rhs);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003320
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003321 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003322
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003323 lhs += LHS_STEP_X;
3324 rhs += RHS_STEP_X;
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003325
Gian Marco Iodice05639f62019-09-24 12:05:06 +01003326#if K0 > 1
3327 a0 = VLOAD(M0)(0, lhs);
3328 b0 = VLOAD(N0)(0, rhs);
3329
3330 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3331
3332 lhs += LHS_STEP_X;
3333 rhs += RHS_STEP_X;
3334#endif // K0 > 1
3335
3336#if K0 > 2
3337 a0 = VLOAD(M0)(0, lhs);
3338 b0 = VLOAD(N0)(0, rhs);
3339
3340 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3341
3342 lhs += LHS_STEP_X;
3343 rhs += RHS_STEP_X;
3344#endif // K0 > 2
3345
3346#if K0 > 3
3347 a0 = VLOAD(M0)(0, lhs);
3348 b0 = VLOAD(N0)(0, rhs);
3349
3350 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3351
3352 lhs += LHS_STEP_X;
3353 rhs += RHS_STEP_X;
3354#endif // K0 > 3
3355
3356#if K0 > 4
3357 a0 = VLOAD(M0)(0, lhs);
3358 b0 = VLOAD(N0)(0, rhs);
3359
3360 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3361
3362 lhs += LHS_STEP_X;
3363 rhs += RHS_STEP_X;
3364
3365 a0 = VLOAD(M0)(0, lhs);
3366 b0 = VLOAD(N0)(0, rhs);
3367
3368 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3369
3370 lhs += LHS_STEP_X;
3371 rhs += RHS_STEP_X;
3372
3373 a0 = VLOAD(M0)(0, lhs);
3374 b0 = VLOAD(N0)(0, rhs);
3375
3376 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3377
3378 lhs += LHS_STEP_X;
3379 rhs += RHS_STEP_X;
3380
3381 a0 = VLOAD(M0)(0, lhs);
3382 b0 = VLOAD(N0)(0, rhs);
3383
3384 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3385
3386 lhs += LHS_STEP_X;
3387 rhs += RHS_STEP_X;
3388#endif // K0 > 4
3389
3390#if K0 > 8
3391 a0 = VLOAD(M0)(0, lhs);
3392 b0 = VLOAD(N0)(0, rhs);
3393
3394 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3395
3396 lhs += LHS_STEP_X;
3397 rhs += RHS_STEP_X;
3398
3399 a0 = VLOAD(M0)(0, lhs);
3400 b0 = VLOAD(N0)(0, rhs);
3401
3402 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3403
3404 lhs += LHS_STEP_X;
3405 rhs += RHS_STEP_X;
3406
3407 a0 = VLOAD(M0)(0, lhs);
3408 b0 = VLOAD(N0)(0, rhs);
3409
3410 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3411
3412 lhs += LHS_STEP_X;
3413 rhs += RHS_STEP_X;
3414
3415 a0 = VLOAD(M0)(0, lhs);
3416 b0 = VLOAD(N0)(0, rhs);
3417
3418 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3419
3420 lhs += LHS_STEP_X;
3421 rhs += RHS_STEP_X;
3422
3423 a0 = VLOAD(M0)(0, lhs);
3424 b0 = VLOAD(N0)(0, rhs);
3425
3426 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3427
3428 lhs += LHS_STEP_X;
3429 rhs += RHS_STEP_X;
3430
3431 a0 = VLOAD(M0)(0, lhs);
3432 b0 = VLOAD(N0)(0, rhs);
3433
3434 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3435
3436 lhs += LHS_STEP_X;
3437 rhs += RHS_STEP_X;
3438
3439 a0 = VLOAD(M0)(0, lhs);
3440 b0 = VLOAD(N0)(0, rhs);
3441
3442 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3443
3444 lhs += LHS_STEP_X;
3445 rhs += RHS_STEP_X;
3446
3447 a0 = VLOAD(M0)(0, lhs);
3448 b0 = VLOAD(N0)(0, rhs);
3449
3450 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3451
3452 lhs += LHS_STEP_X;
3453 rhs += RHS_STEP_X;
3454#endif // K0 > 8
3455
3456#ifndef LHS_INTERLEAVE
3457 lhs += (M0 * K0 * (V0 - 1));
3458#endif // LHS_INTERLEAVE
3459
3460#ifndef RHS_INTERLEAVE
3461 rhs += (N0 * K0 * (H0 - 1));
3462#endif // RHS_INTERLEAVE
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003463 }
3464
3465 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3466
3467 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3468
3469#if defined(REINTERPRET_OUTPUT_AS_3D)
3470
3471 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Michele Di Giorgio5fa963f2020-11-23 15:05:12 +00003472 CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003473 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3474 // multiply dst_stride_z by DEPTH_GEMM3D
3475 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3476
3477#else // defined(REINTERPRET_OUTPUT_AS_3D)
3478
3479 // Add offset for batched GEMM
3480 dst_addr += z * dst_stride_z;
3481
3482#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3483
3484 // Multiply by the weight of matrix-matrix product and store the result
3485#if defined(ALPHA)
3486 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3487#endif // defined(ALPHA)
3488
3489 // Add beta*bias
3490#if defined(BETA)
3491#if defined(BROADCAST_BIAS)
3492 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3493
3494 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3495
3496#ifndef UNIT_BETA
3497 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3498#endif // UNIT_BIAS
3499
3500 // c = c + bias[broadcasted]
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003501#if defined(MIXED_PRECISION)
3502 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3503 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3504#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003505 ADD_BLOCK_BROADCAST(M0, c, bias0);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003506#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003507
3508#else // defined(BROADCAST_BIAS)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003509 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
3510 2) * bias_stride_z;
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003511
3512 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3513
3514#ifndef UNIT_BETA
3515 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3516#endif // UNIT_BIAS
3517
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003518#if defined(MIXED_PRECISION)
3519 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3520 ADD_BLOCK(M0, c, bias_hp);
3521#else // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003522 ADD_BLOCK(M0, c, bias);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003523#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003524
3525#endif // defined(BROADCAST_BIAS)
3526#endif // defined(BETA)
3527
3528#if defined(ACTIVATION_TYPE)
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003529#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003530 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003531#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003532 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Georgios Pinitasa07ce152019-10-11 17:38:50 +01003533#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003534#endif // defined(ACTIVATION_TYPE)
3535
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003536 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
3537 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
3538
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003539 // Store output block
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003540#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003541 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003542 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003543#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003544 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +01003545#endif // defined(MIXED_PRECISION)
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003546
3547#undef LHS_BLOCK_SIZE
3548#undef LHS_OFFSET_X
3549#undef LHS_STEP_X
3550#undef RHS_BLOCK_SIZE
3551#undef RHS_OFFSET_X
3552#undef RHS_STEP_X
3553}
3554
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003555#if defined(OPENCL_IMAGE_SUPPORT)
3556/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
3557 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
3558 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
3559 *
3560 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
3561 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003562 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
3563 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
Gian Marco Iodice781cba72020-06-19 16:56:57 +01003564 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
3565 * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
3566 * could be different from the value returned by get_image_height(rhs_img).
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003567 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
3568 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
3569 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
3570 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
3571 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003572 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
3573 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003574 * @note Only the following configurations of M0, N0 and K0 are currently supported:
3575 * - M0 = 2, 3, 4, 8
3576 * - N0 = 4, 8, 16
3577 * - K0 = 4, 8, 16
3578 * - V0 >= 1
3579 * - H0 >= 1
3580 *
3581 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
3582 * The activation function is performed after the bias addition
3583 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
3584 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
3585 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
3586 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
3587 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
3588 *
3589 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
3590 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
3591 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
3592 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
3593 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
3594 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
3595 * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
3596 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
3597 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
3598 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
3599 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
3600 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
3601 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
3602 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
3603 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
3604 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
3605 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
3606 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
3607 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003608 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003609 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
3610 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
3611 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
3612 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
3613 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
3614 */
3615__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
3616 __read_only image2d_t rhs_img,
3617#if defined(BETA)
3618 IMAGE_DECLARATION(bias),
3619#endif // defined(BETA)
3620 IMAGE_DECLARATION(dst),
Gian Marco Iodicee5563d92020-06-25 17:18:36 +01003621 uint k,
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003622 uint lhs_stride_z,
3623 uint rhs_stride_z,
3624#if defined(BETA)
3625 uint bias_stride_z,
3626#endif //defined(BETA)
3627 uint dst_stride_z
3628#if defined(REINTERPRET_OUTPUT_AS_3D)
3629 ,
3630 uint dst_cross_plane_pad
3631#endif // REINTERPRET_OUTPUT_AS_3D
3632 )
3633{
3634 // Pixel unit
3635#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
3636
3637 // Block size
3638#define LHS_BLOCK_SIZE ((K0) * (M0))
3639
3640#if defined(LHS_INTERLEAVE)
3641#define LHS_OFFSET_X (M0)
3642#define LHS_STEP_X ((M0) * (V0))
3643#define LHS_STEP_LOOP (1)
3644#else // defined(INTERLEAVE)
3645#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
3646#define LHS_STEP_X (M0)
3647#define LHS_STEP_LOOP (V0)
3648#endif // defined(INTERLEAVE)
3649
3650 // Block size
3651#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
3652
3653 // RHS offset and step X
3654#if defined(RHS_INTERLEAVE)
3655#define RHS_OFFSET_X (PIXEL_UNIT)
3656#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
3657#else // defined(RHS_INTERLEAVE)
3658#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
3659#define RHS_STEP_X (PIXEL_UNIT)
3660#endif // defined(RHS_INTERLEAVE)
3661
3662 const uint x = get_global_id(0);
3663 const uint y = get_global_id(1);
3664 const uint z = get_global_id(2);
3665
3666#if defined(DUMMY_WORK_ITEMS)
3667 if((x * N0 >= N) || (y * M0 >= M))
3668 {
3669 return;
3670 }
3671#endif // defined(DUMMY_WORK_ITEMS)
3672
3673 // Compute LHS matrix address
3674 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
3675
3676#if defined(MATRIX_B_DEPTH)
3677 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
3678 const uint z_rhs = (z % MATRIX_B_DEPTH);
3679#else // defined(MATRIX_B_DEPTH)
3680 const uint z_rhs = z;
3681#endif // defined(MATRIX_B_DEPTH)
3682
3683 // Compute RHS matrix coordinates
3684 uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
3685 const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
3686
3687 // Initialize the accumulators
3688 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
3689
3690 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
3691
3692 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
3693
3694 for(int i = 0; i < K; i += K0)
3695 {
3696 VEC_DATA_TYPE(DATA_TYPE, M0)
3697 a0;
3698 VEC_DATA_TYPE(DATA_TYPE, N0)
3699 b0;
3700
3701 a0 = VLOAD(M0)(0, lhs);
3702 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
3703
3704 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3705
3706 lhs += LHS_STEP_X;
3707
3708#if K0 > 1
3709 a0 = VLOAD(M0)(0, lhs);
3710 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
3711
3712 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3713
3714 lhs += LHS_STEP_X;
3715#endif // K0 > 1
3716
3717#if K0 > 2
3718 a0 = VLOAD(M0)(0, lhs);
3719 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
3720
3721 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3722
3723 lhs += LHS_STEP_X;
3724#endif // K0 > 2
3725
3726#if K0 > 3
3727 a0 = VLOAD(M0)(0, lhs);
3728 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
3729
3730 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3731
3732 lhs += LHS_STEP_X;
3733#endif // K0 > 3
3734
3735#if K0 > 4
3736 a0 = VLOAD(M0)(0, lhs);
3737 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
3738
3739 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3740
3741 lhs += LHS_STEP_X;
3742
3743 a0 = VLOAD(M0)(0, lhs);
3744 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
3745
3746 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3747
3748 lhs += LHS_STEP_X;
3749
3750 a0 = VLOAD(M0)(0, lhs);
3751 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
3752
3753 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3754
3755 lhs += LHS_STEP_X;
3756
3757 a0 = VLOAD(M0)(0, lhs);
3758 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
3759
3760 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3761
3762 lhs += LHS_STEP_X;
3763#endif // K0 > 4
3764
3765#if K0 > 8
3766 a0 = VLOAD(M0)(0, lhs);
3767 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
3768
3769 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3770
3771 lhs += LHS_STEP_X;
3772
3773 a0 = VLOAD(M0)(0, lhs);
3774 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
3775
3776 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3777
3778 lhs += LHS_STEP_X;
3779
3780 a0 = VLOAD(M0)(0, lhs);
3781 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
3782
3783 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3784
3785 lhs += LHS_STEP_X;
3786
3787 a0 = VLOAD(M0)(0, lhs);
3788 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
3789
3790 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3791
3792 lhs += LHS_STEP_X;
3793
3794 a0 = VLOAD(M0)(0, lhs);
3795 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
3796
3797 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3798
3799 lhs += LHS_STEP_X;
3800
3801 a0 = VLOAD(M0)(0, lhs);
3802 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
3803
3804 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3805
3806 lhs += LHS_STEP_X;
3807
3808 a0 = VLOAD(M0)(0, lhs);
3809 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
3810
3811 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3812
3813 lhs += LHS_STEP_X;
3814
3815 a0 = VLOAD(M0)(0, lhs);
3816 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
3817
3818 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
3819
3820 lhs += LHS_STEP_X;
3821#endif // K0 > 8
3822
3823#ifndef LHS_INTERLEAVE
3824 lhs += (M0 * K0 * (V0 - 1));
3825#endif // LHS_INTERLEAVE
3826
3827 x_rhs += K0 * RHS_STEP_X;
3828#ifndef RHS_INTERLEAVE
3829 x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
3830#endif // RHS_INTERLEAVE
3831 }
3832
3833 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
3834
3835 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
3836
3837#if defined(REINTERPRET_OUTPUT_AS_3D)
3838
3839 // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Michele Di Giorgio5fa963f2020-11-23 15:05:12 +00003840 CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003841 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
3842 // multiply dst_stride_z by DEPTH_GEMM3D
3843 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
3844
3845#else // defined(REINTERPRET_OUTPUT_AS_3D)
3846
3847 // Add offset for batched GEMM
3848 dst_addr += z * dst_stride_z;
3849
3850#endif // defined(REINTERPRET_OUTPUT_AS_3D)
3851
3852 // Multiply by the weight of matrix-matrix product and store the result
3853#if defined(ALPHA)
3854 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
3855#endif // defined(ALPHA)
3856
3857 // Add beta*bias
3858#if defined(BETA)
3859#if defined(BROADCAST_BIAS)
3860 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
3861
3862 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3863
3864#ifndef UNIT_BETA
3865 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
3866#endif // UNIT_BIAS
3867
3868 // c = c + bias[broadcasted]
3869#if defined(MIXED_PRECISION)
3870 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3871 ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
3872#else // defined(MIXED_PRECISION)
3873 ADD_BLOCK_BROADCAST(M0, c, bias0);
3874#endif // defined(MIXED_PRECISION)
3875
3876#else // defined(BROADCAST_BIAS)
3877 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
3878
3879 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
3880
3881#ifndef UNIT_BETA
3882 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
3883#endif // UNIT_BIAS
3884
3885#if defined(MIXED_PRECISION)
3886 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
3887 ADD_BLOCK(M0, c, bias_hp);
3888#else // defined(MIXED_PRECISION)
3889 ADD_BLOCK(M0, c, bias);
3890#endif // defined(MIXED_PRECISION)
3891
3892#endif // defined(BROADCAST_BIAS)
3893#endif // defined(BETA)
3894
3895#if defined(ACTIVATION_TYPE)
3896#if defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003897 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003898#else // defined(MIXED_PRECISION)
Giorgio Arenad056e572020-10-12 11:53:51 +01003899 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003900#endif // defined(MIXED_PRECISION)
3901#endif // defined(ACTIVATION_TYPE)
3902
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003903 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
3904 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
3905
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003906 // Store output block
3907#if defined(MIXED_PRECISION)
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01003908 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003909 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003910#else // defined(MIXED_PRECISION)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01003911 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +01003912#endif // defined(MIXED_PRECISION)
3913
3914#undef LHS_BLOCK_SIZE
3915#undef LHS_OFFSET_X
3916#undef LHS_STEP_X
3917#undef RHS_BLOCK_SIZE
3918#undef RHS_OFFSET_X
3919#undef RHS_STEP_X
3920#undef PIXEL_UNIT
3921#undef LHS_STEP_LOOP
3922#undef RHS_STEP_LOOP
3923}
3924#endif // defined(OPENCL_IMAGE_SUPPORT)
3925
Giorgio Arenaae99b6e2019-08-01 14:22:12 +01003926#endif // defined(LHS_TRANSPOSE)
3927
Gian Marco Iodicebf9731e2018-12-12 10:18:04 +00003928#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
3929
giuros01b3204e72019-04-01 13:50:22 +01003930#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
3931
3932#define VFMA(a, b, c) \
3933 ({ \
3934 c = fma(a, b, c); \
3935 })
3936
3937#if M0 == 1
3938#define RHS_VFMA_M0xN0(i, a, b, c) \
3939 ({ \
3940 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3941 })
3942#elif M0 == 2 // M0 == 2
3943#define RHS_VFMA_M0xN0(i, a, b, c) \
3944 ({ \
3945 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3946 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3947 })
3948#elif M0 == 3 // M0 == 3
3949#define RHS_VFMA_M0xN0(i, a, b, c) \
3950 ({ \
3951 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3952 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3953 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3954 })
3955#elif M0 == 4 // M0 == 4
3956#define RHS_VFMA_M0xN0(i, a, b, c) \
3957 ({ \
3958 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3959 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3960 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3961 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3962 })
3963#elif M0 == 5 // M0 == 5
3964#define RHS_VFMA_M0xN0(i, a, b, c) \
3965 ({ \
3966 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3967 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3968 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3969 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3970 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3971 })
3972#elif M0 == 6 // M0 == 6
3973#define RHS_VFMA_M0xN0(i, a, b, c) \
3974 ({ \
3975 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3976 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3977 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3978 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3979 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3980 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
3981 })
3982#elif M0 == 7 // M0 == 7
3983#define RHS_VFMA_M0xN0(i, a, b, c) \
3984 ({ \
3985 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3986 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3987 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3988 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
3989 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
3990 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
3991 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
3992 })
3993#elif M0 == 8 // M0 == 8
3994#define RHS_VFMA_M0xN0(i, a, b, c) \
3995 ({ \
3996 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
3997 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
3998 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
3999 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
4000 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
4001 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
4002 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
4003 VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
4004 })
4005#else // M0 not supported
4006#error "M0 not supported"
4007#endif // M0 not supported
4008
4009/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
4010 * The LHS matrix is NOT reshaped
4011 * The RHS matrix is NOT reshaped
4012 *
4013 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004014 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)
4015 * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
4016 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
4017 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
4018 * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
SiCong Li3a501662020-06-26 10:02:06 +01004019 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
4020 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
giuros01b3204e72019-04-01 13:50:22 +01004021 * @note Only the following configurations of M0, N0 and K0 are currently supported:
4022 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
4023 * - N0 = 2, 3, 4, 8, 16
4024 * - K0 = 2, 3, 4, 8, 16
4025 *
Gian Marco Iodiced1f54762019-07-19 09:54:47 +01004026 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004027 * The activation function is performed after the bias addition
giuros01b3204e72019-04-01 13:50:22 +01004028 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
4029 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
4030 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
4031 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
4032 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
4033 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
4034 *
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004035 * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
4036 * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
4037 * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
4038 * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
4039 * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
4040 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
4041 * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
4042 * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
4043 * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
4044 * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
4045 * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
4046 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004047 * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
4048 * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
4049 * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
4050 * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
4051 * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
4052 * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
4053 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
4054 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
4055 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
4056 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
4057 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
4058 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
4059 * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
4060 * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
4061 * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
4062 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
4063 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
4064 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
giuros01b3204e72019-04-01 13:50:22 +01004065 */
4066__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
4067 IMAGE_DECLARATION(rhs),
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004068#if defined(BETA)
4069 IMAGE_DECLARATION(bias),
4070#endif // defined(BETA)
giuros01b3204e72019-04-01 13:50:22 +01004071 IMAGE_DECLARATION(dst),
4072 uint lhs_stride_z,
4073 uint rhs_stride_z,
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004074#if defined(BETA)
4075 uint bias_stride_z,
4076#endif //defined(BETA)
giuros01b3204e72019-04-01 13:50:22 +01004077 uint dst_stride_z
4078#if defined(REINTERPRET_INPUT_AS_3D)
4079 ,
4080 uint lhs_cross_plane_pad
4081#endif // REINTERPRET_INPUT_AS_3D
4082#if defined(REINTERPRET_OUTPUT_AS_3D)
4083 ,
4084 uint dst_cross_plane_pad
4085#endif // REINTERPRET_OUTPUT_AS_3D
4086 )
4087{
4088 // Block size
4089#define RHS_BLOCK_SIZE ((K0) * (N0))
4090
4091 // RHS offset and step X
4092#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
4093
4094 uint x = get_global_id(0);
4095 uint y = get_global_id(1);
4096 uint z = get_global_id(2);
4097
4098#if defined(DUMMY_WORK_ITEMS)
4099 if((x * N0 >= N) || (y * M0 >= M))
4100 {
4101 return;
4102 }
4103#endif // defined(DUMMY_WORK_ITEMS)
4104
4105 // Compute LHS matrix address
SiCong Li406a13f2020-07-15 12:09:58 +01004106 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
giuros01b3204e72019-04-01 13:50:22 +01004107
4108 // Compute RHS matrix address
4109 uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
4110
4111#if defined(MATRIX_B_DEPTH)
4112 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
4113 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
4114#else // defined(MATRIX_B_DEPTH)
4115 rhs_offset += z * rhs_stride_z;
4116#endif // defined(MATRIX_B_DEPTH)
4117
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004118 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
4119 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
giuros01b3204e72019-04-01 13:50:22 +01004120
4121#if defined(REINTERPRET_INPUT_AS_3D)
4122 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01004123 CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
giuros01b3204e72019-04-01 13:50:22 +01004124
4125 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4126 // multiply lhs_stride_z by DEPTH_GEMM3D
4127 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
4128
4129#else // defined(REINTERPRET_INPUT_AS_3D)
4130
4131 // Add offset for batched GEMM
4132 lhs_offset += z * lhs_stride_z;
4133
4134#endif // defined(REINTERPRET_INPUT_AS_3D)
4135
4136 // Initialize the accumulators
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004137 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
giuros01b3204e72019-04-01 13:50:22 +01004138
4139 int i = 0;
4140 for(; i <= (K - K0); i += K0)
4141 {
4142 // Supported cases (M0, K0):
4143 // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
4144 // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
4145 // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
4146 // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
4147 // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
4148 // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
4149 // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
4150 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
4151 // Load values from LHS matrix
4152 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
4153
4154 // Load values from RHS matrix
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004155 LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
giuros01b3204e72019-04-01 13:50:22 +01004156
4157 RHS_VFMA_M0xN0(0, a, b0, c);
4158 RHS_VFMA_M0xN0(1, a, b1, c);
4159#if K0 > 2
4160 RHS_VFMA_M0xN0(2, a, b2, c);
4161#endif // K0 > 2
4162#if K0 > 3
4163 RHS_VFMA_M0xN0(3, a, b3, c);
4164#endif // K0 > 3
4165#if K0 > 4
4166 RHS_VFMA_M0xN0(4, a, b4, c);
4167 RHS_VFMA_M0xN0(5, a, b5, c);
4168 RHS_VFMA_M0xN0(6, a, b6, c);
4169 RHS_VFMA_M0xN0(7, a, b7, c);
4170#endif // K0 > 4
4171#if K0 > 8
4172 RHS_VFMA_M0xN0(8, a, b8, c);
4173 RHS_VFMA_M0xN0(9, a, b9, c);
Gian Marco Iodice7b9d7ca2019-09-19 16:37:39 +01004174 RHS_VFMA_M0xN0(A, a, bA, c);
4175 RHS_VFMA_M0xN0(B, a, bB, c);
4176 RHS_VFMA_M0xN0(C, a, bC, c);
4177 RHS_VFMA_M0xN0(D, a, bD, c);
4178 RHS_VFMA_M0xN0(E, a, bE, c);
4179 RHS_VFMA_M0xN0(F, a, bF, c);
giuros01b3204e72019-04-01 13:50:22 +01004180#endif // K0 > 8
4181
4182 lhs_offset += K0 * sizeof(DATA_TYPE);
4183 rhs_offset += K0 * rhs_stride_y;
4184 }
4185
4186 // Left-over accumulations
4187 for(; i < K; ++i)
4188 {
4189 // Load values from LHS matrix
4190 VEC_DATA_TYPE(DATA_TYPE, 2)
4191 a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
4192#if M0 > 1
4193 VEC_DATA_TYPE(DATA_TYPE, 2)
4194 a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
4195#endif // M0 > 1
4196#if M0 > 2
4197 VEC_DATA_TYPE(DATA_TYPE, 2)
4198 a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
4199#endif // M0 > 2
4200#if M0 > 3
4201 VEC_DATA_TYPE(DATA_TYPE, 2)
4202 a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
4203#endif // M0 > 3
4204#if M0 > 4
4205 VEC_DATA_TYPE(DATA_TYPE, 2)
4206 a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
4207#endif // M0 > 4
4208#if M0 > 5
4209 VEC_DATA_TYPE(DATA_TYPE, 2)
4210 a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
4211#endif // M0 > 5
4212#if M0 > 6
4213 VEC_DATA_TYPE(DATA_TYPE, 2)
4214 a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
4215#endif // M0 > 6
4216#if M0 > 7
4217 VEC_DATA_TYPE(DATA_TYPE, 2)
4218 a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
4219#endif // M0 > 7
4220
4221 VEC_DATA_TYPE(DATA_TYPE, N0)
4222 b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
4223 RHS_VFMA_M0xN0(0, a, b, c);
4224
4225 lhs_offset += sizeof(DATA_TYPE);
4226 rhs_offset += rhs_stride_y;
4227 }
4228
SiCong Li406a13f2020-07-15 12:09:58 +01004229 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
giuros01b3204e72019-04-01 13:50:22 +01004230
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004231 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
giuros01b3204e72019-04-01 13:50:22 +01004232
4233#if defined(REINTERPRET_OUTPUT_AS_3D)
4234 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice9ae06d42020-10-22 16:37:12 +01004235 CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
giuros01b3204e72019-04-01 13:50:22 +01004236
4237 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
4238 // multiply dst_stride_z by DEPTH_GEMM3D
4239 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
4240
4241#else // defined(REINTERPRET_OUTPUT_AS_3D)
4242
4243 // Add offset for batched GEMM
4244 dst_addr += z * dst_stride_z;
4245
4246#endif // defined(REINTERPRET_OUTPUT_AS_3D)
4247
4248 // Multiply by the weight of matrix-matrix product and store the result
giuros01b3204e72019-04-01 13:50:22 +01004249#if defined(ALPHA)
4250 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
4251#endif // defined(ALPHA)
4252
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004253 // Add beta*bias
4254#if defined(BETA)
4255#if defined(BROADCAST_BIAS)
4256 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
4257
4258 LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
4259
4260#ifndef UNIT_BETA
4261 SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
4262#endif // UNIT_BIAS
4263
4264 // c = c + bias[broadcasted]
4265 ADD_BLOCK_BROADCAST(M0, c, bias0);
4266
4267#else // defined(BROADCAST_BIAS)
SiCong Li406a13f2020-07-15 12:09:58 +01004268 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
Gian Marco Iodice944170e2019-06-24 14:40:30 +01004269
4270 LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
4271
4272#ifndef UNIT_BETA
4273 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
4274#endif // UNIT_BIAS
4275
4276 // c = c + bias
4277 ADD_BLOCK(M0, c, bias);
4278
4279#endif // defined(BROADCAST_BIAS)
4280#endif // defined(BETA)
4281
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004282#if defined(ACTIVATION_TYPE)
Giorgio Arenad056e572020-10-12 11:53:51 +01004283 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
Gian Marco Iodiceca1f4602019-07-16 15:46:48 +01004284#endif // defined(ACTIVATION_TYPE)
4285
Gian Marco Iodice088d63a2020-08-11 14:14:06 +01004286 const bool cond_y = y == 0;
4287 const bool cond_x = ((x + 1) * N0 >= N);
4288
giuros01b3204e72019-04-01 13:50:22 +01004289 // Store output block
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01004290 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
giuros01b3204e72019-04-01 13:50:22 +01004291
4292#undef RHS_BLOCK_SIZE
4293#undef RHS_OFFSET_X
4294#undef RHS_STEP_X
4295}
4296#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
4297
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00004298#if defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004299/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
4300 *
Gian Marco19835e52018-01-30 13:35:54 +00004301 * @note The beta's value need to be passed at compile time using -DBETA
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004302 *
4303 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
4304 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
4305 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
4306 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
4307 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004308 * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in bytes)
4309 * @param[in] src_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004310 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01004311 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004312 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
4313 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
4314 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
4315 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
Isabella Gottardi8e74f442018-03-01 16:42:00 +00004316 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
4317 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01004318 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
4319 */
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Build the tensor views for both operands: src is matrix C, dst is the matrix updated in place
    Tensor3D mat_c  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D mat_ab = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Typed views of the two element pointers
    __global float *ab_addr = (__global float *)mat_ab.ptr;
    __global float *c_addr  = (__global float *)mat_c.ptr;

    // Read four floats from dst (presumably the already-computed alpha * A * B product) and four from C
    const float4 ab_vals = vload4(0, ab_addr);
    const float4 c_vals  = vload4(0, c_addr);

    // Accumulate beta * C onto the dst values and write the result back over dst
    vstore4(ab_vals + (float4)BETA * c_vals, 0, ab_addr);
}
4339
Vidhya Sudhan Loganathan76c85642018-05-25 13:53:02 +01004340#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel performs the in-place matrix addition dst = dst + BETA * src on F16 data.
 *  Each invocation loads eight half values from dst and eight from src, and stores the eight results back to dst.
 *
 * @note The value of beta must be passed at compile time using -DBETA
 *
 * @param[in]     src_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]     src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix, read and then overwritten with the result. Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]     dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Build the tensor views for both operands: src is matrix C, dst is the matrix updated in place
    Tensor3D mat_c  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D mat_ab = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Typed views of the two element pointers
    __global half *ab_addr = (__global half *)mat_ab.ptr;
    __global half *c_addr  = (__global half *)mat_c.ptr;

    // Read eight halves from dst (presumably the already-computed alpha * A * B product) and eight from C
    const half8 ab_vals = vload8(0, ab_addr);
    const half8 c_vals  = vload8(0, c_addr);

    // Accumulate beta * C onto the dst values and write the result back over dst
    vstore8(ab_vals + (half8)BETA * c_vals, 0, ab_addr);
}
Vidhya Sudhan Loganathan76c85642018-05-25 13:53:02 +01004381#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Georgios Pinitas96b16b62020-12-01 17:41:34 +00004382#endif // defined(BETA)