blob: 9bec8d5d92a64cca84017b58b4dcb9a84b0c0251 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
27 *
28 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
29 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
30 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
31 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
32 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
33 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
34 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
35 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
36 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
37 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
38 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
39 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
40 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010041__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
42 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043{
44 uint x = get_global_id(0);
45 uint y = get_global_id(1);
46
47 /* Compute address for Matrix B - source */
48 Image src = CONVERT_TO_IMAGE_STRUCT(src);
49
50 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
51 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
52
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010053 uint4 b0 = vload4(0, (__global uint *)src.ptr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010054
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010055 vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010056}
57
58/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
59 *
60 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
61 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
62 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
63 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
64 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
65 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
66 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
67 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
68 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
69 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
70 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
71 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
72 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010073__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
74 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075{
76 uint x = get_global_id(0);
77 uint y = get_global_id(1);
78
79 /* Compute address for Matrix B - source */
80 Image src = CONVERT_TO_IMAGE_STRUCT(src);
81
82 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
83 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
84
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010085 ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010086
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010087 vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010088}
89
90/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
91 *
92 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8
93 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
94 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
95 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
96 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
97 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
98 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8
99 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
100 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
101 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
102 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
103 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
104 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +0100105__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
106 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100107{
108 uint x = get_global_id(0);
109 uint y = get_global_id(1);
110
111 /* Compute address for Matrix B - source */
112 Image src = CONVERT_TO_IMAGE_STRUCT(src);
113
114 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
115 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
116
117 uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
118
119 vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
120}
121
122/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
123 *
124 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
125 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
126 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
127 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
128 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
129 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
130 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U32/S32/F32
131 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
132 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
133 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
134 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
135 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
136 */
137__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
138 IMAGE_DECLARATION(dst))
139{
140 /* Compute source and destination addresses */
141 Image src = CONVERT_TO_IMAGE_STRUCT(src);
142 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
143
144 /* Load values from Matrix A */
145 float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
146 float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
147 float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
148 float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
149
150 float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
151 vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
152
153 val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
154 vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
155
156 val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
157 vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
158
159 val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
160 vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
161}
162
163/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
164 *
165 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/F16
166 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
167 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
168 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
169 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
170 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
171 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U16/S16/F16
172 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
173 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
174 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
175 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
176 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
177 */
178__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
179 IMAGE_DECLARATION(dst))
180{
181 /* Compute source and destination addresses */
182 Image src = CONVERT_TO_IMAGE_STRUCT(src);
183 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
184
185 /* Load values from Matrix A */
186 half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
187 half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
188 half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
189 half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
190
191 half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
192 vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
193
194 val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
195 vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
196
197 val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
198 vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
199
200 val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
201 vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
202}
203
204/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
205 *
206 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8
207 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
208 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
209 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
210 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
211 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
212 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8/S8
213 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
214 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
215 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
216 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
217 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
218 */
219__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
220 IMAGE_DECLARATION(dst))
221{
222 /* Compute source and destination addresses */
223 Image src = CONVERT_TO_IMAGE_STRUCT(src);
224 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
225
226 /* Load values from Matrix A */
227 uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
228 uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
229 uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
230 uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
231
232 uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
233 (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
234 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
235
236 val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
237 (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
238 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
239
240 val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
241 (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
242 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
243
244 val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
245 (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
246 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
247}
248
249/** This kernel accumulates each row with the biases vector
250 *
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100251 * @note The data type must be passed at compile time -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
252 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100253 * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
254 * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
255 * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
256 * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
257 * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
258 * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
259 * @param[in] biases_ptr Pointer to the biases vector. Same as input.
260 * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
261 * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
262 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
263 */
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100264#if(defined DATA_TYPE)
265__kernel void gemm_accumulate_biases(
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100266 IMAGE_DECLARATION(accum),
267 VECTOR_DECLARATION(biases))
268{
269 Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
270 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
271
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100272 VEC_DATA_TYPE(DATA_TYPE, 16)
273 accum_value = vload16(0, (__global DATA_TYPE *)accum.ptr);
274 VEC_DATA_TYPE(DATA_TYPE, 16)
275 biases_value = vload16(0, (__global DATA_TYPE *)biases.ptr);
276 accum_value = biases_value + accum_value;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100277
278 // Store result in the accummulate buffer
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100279 vstore16(accum_value, 0, (__global DATA_TYPE *)accum.ptr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100280}
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100281#endif // defined DATA_TYPE
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100282
283#if(defined WIDTH_MATRIX_B)
284/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
285 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication
286 *
287 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B
288 *
289 * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
290 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
291 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
292 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
293 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
294 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
295 * @param[in] src1_ptr Pointer to the source matrix. Supported formats: U8
296 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
297 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
298 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
299 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
300 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
301 * @param[out] dst_ptr Pointer to the destination matrix Supported formats: U8
302 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
303 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
304 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
305 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
306 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
307 * @param[in] a_offset Offset to be added to each element of the matrix A
308 * @param[in] b_offset Offset to be added to each element of the matrix B.
309 * @param[in] c_offset Offset to be added to each element of the matrix C.
310 * @param[in] c_mult_int Multiplied with each element of the matrix C.
311 * @param[in] shift Number of bits to shift right the result.
312 */
313__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
314 IMAGE_DECLARATION(src1),
315 IMAGE_DECLARATION(dst),
316 int a_offset,
317 int b_offset,
318 int c_offset,
319 int c_mult_int,
320 int shift)
321{
322 /* src_addr.s0 = address of matrix A */
323 /* src_addr.s1 = address of matrix B */
324
325 /* Compute address for matrix A and B */
326 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
327 (src1_stride_y));
328
329 /* Add offset_first_element_in_bytes */
330 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
331
332 /* Compute end row address for matrix B */
333 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
334
335 /* Reset accumulators */
336 int16 c00 = 0.0f;
337 int16 c10 = 0.0f;
338 int16 c20 = 0.0f;
339 int16 c30 = 0.0f;
340
341 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
342 {
343 /* Load values from matrix A (interleaved) and matrix B (transposed) */
344 int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
345 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
346
347 c00 += (int16)a0.s0 * b0;
348 c10 += (int16)a0.s1 * b0;
349 c20 += (int16)a0.s2 * b0;
350 c30 += (int16)a0.s3 * b0;
351
352 int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
353
354 c00 += (int16)a0.s4 * b1;
355 c10 += (int16)a0.s5 * b1;
356 c20 += (int16)a0.s6 * b1;
357 c30 += (int16)a0.s7 * b1;
358 }
359
360 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
361 {
362 /* Load values from matrix A (interleaved) and matrix B (transposed) */
363 int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
364 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
365
366 c00 += (int16)a0.s0 * b0;
367 c10 += (int16)a0.s1 * b0;
368 c20 += (int16)a0.s2 * b0;
369 c30 += (int16)a0.s3 * b0;
370 }
371
372 /* Compute destination address */
373 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
374
375 /* Multiply by the weight of matrix product */
376 c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
377 c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
378 c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
379 c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
380
381 /* Store 4x16 block */
382 vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
383 vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
384 vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
385 vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
386}
387#endif
388
389#if(defined WIDTH_MATRIX_B && defined ALPHA)
390/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
391 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
392 *
393 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
394 *
395 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
396 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
397 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
398 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
399 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
400 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
401 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
402 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
403 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
404 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
405 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
406 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
407 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
408 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
409 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
410 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
411 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
412 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
413 */
414__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
415 IMAGE_DECLARATION(src1),
416 IMAGE_DECLARATION(dst))
417{
418 /* src_addr.s0 = address of matrix A */
419 /* src_addr.s1 = address of matrix B */
420
421 /* Compute address for matrix A and B */
422 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
423 (src1_stride_y));
424
425 /* Add offset_first_element_in_bytes */
426 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
427
428 /* Divide by 4 in order to get the src_addr in unit of float */
429 src_addr = src_addr >> 2;
430
431 /* Compute end row address for matrix B */
432 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
433
434 /* Reset accumulators */
435 float4 c00 = 0.0f;
436 float4 c10 = 0.0f;
437 float4 c20 = 0.0f;
438 float4 c30 = 0.0f;
439
440 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
441 {
442 /* Load values from matrix A (interleaved) and matrix B (transposed) */
443 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
444 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
445
446 c00 += (float4)a0.s0 * b0;
447 c10 += (float4)a0.s1 * b0;
448 c20 += (float4)a0.s2 * b0;
449 c30 += (float4)a0.s3 * b0;
450
451 /* Load values from matrix A (interleaved) and matrix B (transposed) */
452 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
453 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
454
455 c00 += (float4)a0.s0 * b0;
456 c10 += (float4)a0.s1 * b0;
457 c20 += (float4)a0.s2 * b0;
458 c30 += (float4)a0.s3 * b0;
459 }
460
461 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
462 {
463 /* Load values from matrix A (interleaved) and matrix B (transposed) */
464 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
465 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
466
467 c00 += (float4)a0.s0 * b0;
468 c10 += (float4)a0.s1 * b0;
469 c20 += (float4)a0.s2 * b0;
470 c30 += (float4)a0.s3 * b0;
471 }
472
473 /* Compute destination address */
474 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
475
476 /* Multiply by the weight of matrix product */
477 c00 = c00 * (float4)ALPHA;
478 c10 = c10 * (float4)ALPHA;
479 c20 = c20 * (float4)ALPHA;
480 c30 = c30 * (float4)ALPHA;
481
482 /* Store 4x4 block */
483 vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
484 vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
485 vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
486 vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
487}
488
489/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
490 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
491 *
492 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
493 *
494 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
495 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
496 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
497 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
498 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
499 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
500 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
501 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
502 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
503 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
504 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
505 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
506 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
507 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
508 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
509 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
510 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
511 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
512 */
513__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
514 IMAGE_DECLARATION(src1),
515 IMAGE_DECLARATION(dst))
516{
517 // src_addr_a = address of matrix A
518 // src_addr_b = address of matrix B
519 __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
520 __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
521
522 // Compute end row address for matrix B
523 __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
524
525 // Reset accumulators
526 float c00 = 0.0f;
527 float c01 = 0.0f;
528 float c02 = 0.0f;
529 float c03 = 0.0f;
530 float c10 = 0.0f;
531 float c11 = 0.0f;
532 float c12 = 0.0f;
533 float c13 = 0.0f;
534 float c20 = 0.0f;
535 float c21 = 0.0f;
536 float c22 = 0.0f;
537 float c23 = 0.0f;
538 float c30 = 0.0f;
539 float c31 = 0.0f;
540 float c32 = 0.0f;
541 float c33 = 0.0f;
542
543 for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
544 {
545 // Load values from matrix A (interleaved) and matrix B (transposed)
546 float4 a0 = vload4(0, src_addr_a);
547 float4 b0 = vload4(0, src_addr_b);
548
549 c00 = fma(a0.s0, b0.s0, c00);
550 c01 = fma(a0.s0, b0.s1, c01);
551 c02 = fma(a0.s0, b0.s2, c02);
552 c03 = fma(a0.s0, b0.s3, c03);
553
554 c10 = fma(a0.s1, b0.s0, c10);
555 c11 = fma(a0.s1, b0.s1, c11);
556 c12 = fma(a0.s1, b0.s2, c12);
557 c13 = fma(a0.s1, b0.s3, c13);
558
559 c20 = fma(a0.s2, b0.s0, c20);
560 c21 = fma(a0.s2, b0.s1, c21);
561 c22 = fma(a0.s2, b0.s2, c22);
562 c23 = fma(a0.s2, b0.s3, c23);
563
564 c30 = fma(a0.s3, b0.s0, c30);
565 c31 = fma(a0.s3, b0.s1, c31);
566 c32 = fma(a0.s3, b0.s2, c32);
567 c33 = fma(a0.s3, b0.s3, c33);
568
569 // Load values from matrix A (interleaved) and matrix B (transposed)
570 a0 = vload4(0, src_addr_a + 4);
571 b0 = vload4(0, src_addr_b + 4);
572
573 c00 = fma(a0.s0, b0.s0, c00);
574 c01 = fma(a0.s0, b0.s1, c01);
575 c02 = fma(a0.s0, b0.s2, c02);
576 c03 = fma(a0.s0, b0.s3, c03);
577
578 c10 = fma(a0.s1, b0.s0, c10);
579 c11 = fma(a0.s1, b0.s1, c11);
580 c12 = fma(a0.s1, b0.s2, c12);
581 c13 = fma(a0.s1, b0.s3, c13);
582
583 c20 = fma(a0.s2, b0.s0, c20);
584 c21 = fma(a0.s2, b0.s1, c21);
585 c22 = fma(a0.s2, b0.s2, c22);
586 c23 = fma(a0.s2, b0.s3, c23);
587
588 c30 = fma(a0.s3, b0.s0, c30);
589 c31 = fma(a0.s3, b0.s1, c31);
590 c32 = fma(a0.s3, b0.s2, c32);
591 c33 = fma(a0.s3, b0.s3, c33);
592
593 // Load values from matrix A (interleaved) and matrix B (transposed)
594 a0 = vload4(0, src_addr_a + 8);
595 b0 = vload4(0, src_addr_b + 8);
596
597 c00 = fma(a0.s0, b0.s0, c00);
598 c01 = fma(a0.s0, b0.s1, c01);
599 c02 = fma(a0.s0, b0.s2, c02);
600 c03 = fma(a0.s0, b0.s3, c03);
601
602 c10 = fma(a0.s1, b0.s0, c10);
603 c11 = fma(a0.s1, b0.s1, c11);
604 c12 = fma(a0.s1, b0.s2, c12);
605 c13 = fma(a0.s1, b0.s3, c13);
606
607 c20 = fma(a0.s2, b0.s0, c20);
608 c21 = fma(a0.s2, b0.s1, c21);
609 c22 = fma(a0.s2, b0.s2, c22);
610 c23 = fma(a0.s2, b0.s3, c23);
611
612 c30 = fma(a0.s3, b0.s0, c30);
613 c31 = fma(a0.s3, b0.s1, c31);
614 c32 = fma(a0.s3, b0.s2, c32);
615 c33 = fma(a0.s3, b0.s3, c33);
616
617 // Load values from matrix A (interleaved) and matrix B (transposed)
618 a0 = vload4(0, src_addr_a + 12);
619 b0 = vload4(0, src_addr_b + 12);
620
621 c00 = fma(a0.s0, b0.s0, c00);
622 c01 = fma(a0.s0, b0.s1, c01);
623 c02 = fma(a0.s0, b0.s2, c02);
624 c03 = fma(a0.s0, b0.s3, c03);
625
626 c10 = fma(a0.s1, b0.s0, c10);
627 c11 = fma(a0.s1, b0.s1, c11);
628 c12 = fma(a0.s1, b0.s2, c12);
629 c13 = fma(a0.s1, b0.s3, c13);
630
631 c20 = fma(a0.s2, b0.s0, c20);
632 c21 = fma(a0.s2, b0.s1, c21);
633 c22 = fma(a0.s2, b0.s2, c22);
634 c23 = fma(a0.s2, b0.s3, c23);
635
636 c30 = fma(a0.s3, b0.s0, c30);
637 c31 = fma(a0.s3, b0.s1, c31);
638 c32 = fma(a0.s3, b0.s2, c32);
639 c33 = fma(a0.s3, b0.s3, c33);
640 }
641
642 for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
643 {
644 // Load values from matrix A (interleaved) and matrix B (transposed)
645 float4 a0 = vload4(0, src_addr_a);
646 float4 b0 = vload4(0, src_addr_b);
647
648 c00 = fma(a0.s0, b0.s0, c00);
649 c01 = fma(a0.s0, b0.s1, c01);
650 c02 = fma(a0.s0, b0.s2, c02);
651 c03 = fma(a0.s0, b0.s3, c03);
652
653 c10 = fma(a0.s1, b0.s0, c10);
654 c11 = fma(a0.s1, b0.s1, c11);
655 c12 = fma(a0.s1, b0.s2, c12);
656 c13 = fma(a0.s1, b0.s3, c13);
657
658 c20 = fma(a0.s2, b0.s0, c20);
659 c21 = fma(a0.s2, b0.s1, c21);
660 c22 = fma(a0.s2, b0.s2, c22);
661 c23 = fma(a0.s2, b0.s3, c23);
662
663 c30 = fma(a0.s3, b0.s0, c30);
664 c31 = fma(a0.s3, b0.s1, c31);
665 c32 = fma(a0.s3, b0.s2, c32);
666 c33 = fma(a0.s3, b0.s3, c33);
667 }
668
669 // Compute destination address
670 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
671
672 // Multiply by the weight of matrix product
673 c00 = c00 * ALPHA;
674 c01 = c01 * ALPHA;
675 c02 = c02 * ALPHA;
676 c03 = c03 * ALPHA;
677 c10 = c10 * ALPHA;
678 c11 = c11 * ALPHA;
679 c12 = c12 * ALPHA;
680 c13 = c13 * ALPHA;
681 c20 = c20 * ALPHA;
682 c21 = c21 * ALPHA;
683 c22 = c22 * ALPHA;
684 c23 = c23 * ALPHA;
685 c30 = c30 * ALPHA;
686 c31 = c31 * ALPHA;
687 c32 = c32 * ALPHA;
688 c33 = c33 * ALPHA;
689
690 barrier(CLK_GLOBAL_MEM_FENCE);
691
692 // Store 4x4 block
693 vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
694 vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
695 vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
696 vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
697}
698
699/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
700 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
701 *
702 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
703 *
704 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
705 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
706 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
707 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
708 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
709 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
710 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
711 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
712 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
713 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
714 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
715 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
716 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
717 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
718 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
719 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
720 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
721 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
722 */
723__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
724 IMAGE_DECLARATION(src1),
725 IMAGE_DECLARATION(dst))
726{
727 /* src_addr.s0 = address of matrix A */
728 /* src_addr.s1 = address of matrix B */
729
730 /* Compute address for matrix A and B */
731 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
732 (src1_stride_y));
733
734 /* Add offset_first_element_in_bytes */
735 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
736
737 /* Divide by 2 in order to get the src_addr in unit of half */
738 src_addr = src_addr >> 1;
739
740 /* Compute end row address for matrix B */
741 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
742
743 /* Reset accumulators */
744 half8 c00 = 0.0f;
745 half8 c10 = 0.0f;
746 half8 c20 = 0.0f;
747 half8 c30 = 0.0f;
748
749 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
750 {
751 /* Load values from matrix A (interleaved) and matrix B (transposed) */
752 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
753 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
754
755 c00 += (half8)a0.s0 * b0;
756 c10 += (half8)a0.s1 * b0;
757 c20 += (half8)a0.s2 * b0;
758 c30 += (half8)a0.s3 * b0;
759
760 /* Load values from matrix A (interleaved) and matrix B (transposed) */
761 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
762 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
763
764 c00 += (half8)a0.s0 * b0;
765 c10 += (half8)a0.s1 * b0;
766 c20 += (half8)a0.s2 * b0;
767 c30 += (half8)a0.s3 * b0;
768 }
769
770 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
771 {
772 /* Load values from matrix A (interleaved) and matrix B (transposed) */
773 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
774 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
775
776 c00 += (half8)a0.s0 * b0;
777 c10 += (half8)a0.s1 * b0;
778 c20 += (half8)a0.s2 * b0;
779 c30 += (half8)a0.s3 * b0;
780 }
781
782 /* Compute destination address */
783 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
784
785 /* Multiply by the weight of matrix product */
786 c00 = c00 * (half8)ALPHA;
787 c10 = c10 * (half8)ALPHA;
788 c20 = c20 * (half8)ALPHA;
789 c30 = c30 * (half8)ALPHA;
790
791 /* Store 4x8 block */
792 vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
793 vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
794 vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
795 vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
796}
797
798#if(defined WIDTH_VECTOR_A)
799/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
800 *
801 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
802 *
803 * @attention The input vector A and matrix B must not be reshaped
804 *
805 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
806 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
807 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
808 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
809 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
810 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
811 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
812 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
813 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
814 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
815 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
816 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
817 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
818 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
819 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
820 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
821 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
822 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
823 */
824__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
825 IMAGE_DECLARATION(src1),
826 IMAGE_DECLARATION(dst))
827{
828 int idx = get_global_id(0) * 4;
829
830 /* Compute the address for the vector A and matrix B */
831 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
832 src_addr.s1 += idx * sizeof(float);
833
834 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
835
836 float4 acc = 0.0f;
837
838 for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
839 {
840 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
841 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
842 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
843
844 acc += b0 * (float4)a0.s0;
845 acc += b1 * (float4)a0.s1;
846 }
847
848 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
849 {
850 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
851 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
852
853 acc += b0 * (float4)a0;
854 }
855
856 /* Compute destination address */
857 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
858
859 /* Multiply by the weight of vector-matrix product */
860 acc = acc * (float4)ALPHA;
861
862 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
863}
864
865/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
866 *
867 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
868 *
869 * @attention The input vector A and matrix B must not be reshaped
870 *
871 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
872 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
873 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
874 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
875 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
876 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
877 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
878 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
879 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
880 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
881 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
882 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
883 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
884 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
885 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
886 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
887 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
888 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
889 */
890__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
891 IMAGE_DECLARATION(src1),
892 IMAGE_DECLARATION(dst))
893{
894 int idx = get_global_id(0) * 8;
895
896 /* Compute the address for the vector A and matrix B */
897 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
898 src_addr.s1 += idx * sizeof(half);
899
900 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
901
902 half8 acc = 0.0f;
903
904 for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
905 {
906 half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
907 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
908 half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
909 half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
910 half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
911
912 acc += b0 * (half8)a0.s0;
913 acc += b1 * (half8)a0.s1;
914 acc += b2 * (half8)a0.s2;
915 acc += b3 * (half8)a0.s3;
916 }
917
918 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
919 {
920 half a0 = *((__global half *)(src0_ptr + src_addr.s0));
921 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
922
923 acc += b0 * (half8)a0;
924 }
925
926 /* Compute destination address */
927 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
928
929 /* Multiply by the weight of vector-matrix product */
930 acc = acc * (half8)ALPHA;
931
932 vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
933}
934#endif /* (defined WIDTH_VECTOR_A) */
935#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
936
937#if(defined BETA)
938/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
939 *
940 * @attention The beta's value need to be passed at compile time using -DBETA
941 *
942 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
943 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
944 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
945 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
946 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
947 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
948 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
949 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
950 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
951 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
952 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
953 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
954 */
955__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
956 IMAGE_DECLARATION(dst))
957{
958 /* Compute source and destination addresses */
959 Image src = CONVERT_TO_IMAGE_STRUCT(src);
960 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
961
962 /* Load values from A x B */
963 float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
964
965 /* Load values from Matrix C */
966 float4 c = vload4(0, (__global float *)src.ptr);
967
968 /* Computes alpha * axb + beta * c */
969 float4 out = alpha_ab + (float4)BETA * c;
970
971 /* Store final result in axb matrix */
972 vstore4(out, 0, (__global float *)dst.ptr);
973}
974
975/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
976 *
977 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
978 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
979 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
980 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
981 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
982 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
983 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
984 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
985 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
986 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
987 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
988 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
989 */
990__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
991 IMAGE_DECLARATION(dst))
992{
993 /* Compute source and destination addresses */
994 Image src = CONVERT_TO_IMAGE_STRUCT(src);
995 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
996
997 /* Load values from A x B */
998 half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
999
1000 /* Load values from Matrix C */
1001 half8 c = vload8(0, (__global half *)src.ptr);
1002
1003 /* Computes alpha * axb + beta * c */
1004 half8 out = alpha_ab + (half8)BETA * c;
1005
1006 /* Store final result in axb matrix */
1007 vstore8(out, 0, (__global half *)dst.ptr);
1008}
1009#endif /* (defined BETA) */
1010
1011#if(defined WIDTH_VECTOR_A)
1012/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
1013 *
1014 * @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
1015 *
1016 * @attention The input A and matrix B must not be reshaped
1017 *
1018 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
1019 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1020 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1021 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1022 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1023 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1024 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
1025 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1026 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1027 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1028 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1029 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
1030 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1031 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1032 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
1033 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1034 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1035 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1036 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1037 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1038 */
1039__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
1040 TENSOR3D_DECLARATION(src1),
1041 IMAGE_DECLARATION(dst))
1042{
1043 int idx = get_global_id(0) * 4;
1044 int idy = get_global_id(1);
1045
1046 /* Compute the address for the vector A and matrix B */
1047 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
1048 src_addr.s1 += idx * sizeof(float);
1049
1050 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
1051
1052 float4 acc = 0.0f;
1053
1054 for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
1055 {
1056 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
1057 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1058 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
1059
1060 acc += b0 * (float4)a0.s0;
1061 acc += b1 * (float4)a0.s1;
1062 }
1063
1064 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
1065 {
1066 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
1067 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1068
1069 acc += b0 * (float4)a0;
1070 }
1071
1072 /* Compute destination address */
1073 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1074
1075 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
1076}
1077#endif /* (defined WIDTH_VECTOR_A) */