blob: caf6e3ffd8b2785c71036352d0ff2584cd796278 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
27 *
28 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
29 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
30 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
31 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
32 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
33 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
34 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
35 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
36 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
37 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
38 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
39 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
40 */
41__kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
42 IMAGE_DECLARATION(dst))
43{
44 uint x = get_global_id(0);
45 uint y = get_global_id(1);
46
47 /* Compute address for Matrix B - source */
48 Image src = CONVERT_TO_IMAGE_STRUCT(src);
49
50 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
51 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
52
53 float4 b0 = vload4(0, (__global float *)src.ptr);
54
55 vstore4(b0, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
56}
57
58/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
59 *
60 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
61 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
62 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
63 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
64 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
65 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
66 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
67 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
68 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
69 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
70 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
71 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
72 */
73__kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
74 IMAGE_DECLARATION(dst))
75{
76 uint x = get_global_id(0);
77 uint y = get_global_id(1);
78
79 /* Compute address for Matrix B - source */
80 Image src = CONVERT_TO_IMAGE_STRUCT(src);
81
82 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
83 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
84
85 half8 b0 = vload8(0, (__global half *)src.ptr);
86
87 vstore8(b0, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
88}
89
90/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
91 *
92 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8
93 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
94 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
95 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
96 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
97 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
98 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8
99 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
100 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
101 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
102 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
103 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
104 */
105__kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
106 IMAGE_DECLARATION(dst))
107{
108 uint x = get_global_id(0);
109 uint y = get_global_id(1);
110
111 /* Compute address for Matrix B - source */
112 Image src = CONVERT_TO_IMAGE_STRUCT(src);
113
114 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
115 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
116
117 uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
118
119 vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
120}
121
122/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
123 *
124 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
125 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
126 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
127 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
128 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
129 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
130 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U32/S32/F32
131 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
132 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
133 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
134 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
135 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
136 */
137__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
138 IMAGE_DECLARATION(dst))
139{
140 /* Compute source and destination addresses */
141 Image src = CONVERT_TO_IMAGE_STRUCT(src);
142 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
143
144 /* Load values from Matrix A */
145 float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
146 float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
147 float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
148 float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
149
150 float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
151 vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
152
153 val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
154 vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
155
156 val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
157 vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
158
159 val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
160 vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
161}
162
163/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
164 *
165 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/F16
166 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
167 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
168 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
169 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
170 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
171 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U16/S16/F16
172 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
173 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
174 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
175 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
176 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
177 */
178__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
179 IMAGE_DECLARATION(dst))
180{
181 /* Compute source and destination addresses */
182 Image src = CONVERT_TO_IMAGE_STRUCT(src);
183 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
184
185 /* Load values from Matrix A */
186 half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
187 half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
188 half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
189 half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
190
191 half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
192 vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
193
194 val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
195 vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
196
197 val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
198 vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
199
200 val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
201 vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
202}
203
204/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
205 *
206 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8
207 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
208 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
209 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
210 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
211 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
212 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8/S8
213 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
214 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
215 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
216 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
217 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
218 */
219__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
220 IMAGE_DECLARATION(dst))
221{
222 /* Compute source and destination addresses */
223 Image src = CONVERT_TO_IMAGE_STRUCT(src);
224 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
225
226 /* Load values from Matrix A */
227 uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
228 uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
229 uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
230 uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
231
232 uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
233 (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
234 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
235
236 val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
237 (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
238 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
239
240 val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
241 (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
242 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
243
244 val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
245 (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
246 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
247}
248
249/** This kernel accumulates each row with the biases vector
250 *
251 * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
252 * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
253 * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
254 * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
255 * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
256 * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
257 * @param[in] biases_ptr Pointer to the biases vector. Same as input.
258 * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
259 * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
260 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
261 */
262__kernel void gemm_accumulate_biases_f32(
263 IMAGE_DECLARATION(accum),
264 VECTOR_DECLARATION(biases))
265{
266 Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
267 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
268
269 float4 accum_value = vload4(0, (__global float *)accum.ptr);
270 float4 biases_value = vload4(0, (__global float *)biases.ptr);
271 accum_value = biases_value + accum_value;
272
273 // Store result in the accummulate buffer
274 vstore4(accum_value, 0, (__global float *)accum.ptr);
275}
276
277/** This kernel accumulates each row with the biases vector
278 *
279 * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
280 * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
281 * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
282 * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
283 * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
284 * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
285 * @param[in] biases_ptr Pointer to the biases vector. Same as input.
286 * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
287 * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
288 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
289 */
290__kernel void gemm_accumulate_biases_f16(
291 IMAGE_DECLARATION(accum),
292 VECTOR_DECLARATION(biases))
293{
294 Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
295 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
296
297 half8 accum_value = vload8(0, (__global half *)accum.ptr);
298 half8 biases_value = vload8(0, (__global half *)biases.ptr);
299 accum_value = biases_value + accum_value;
300
301 // Store result in the accummulate buffer
302 vstore8(accum_value, 0, (__global half *)accum.ptr);
303}
304
305#if(defined WIDTH_MATRIX_B)
306/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
307 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication
308 *
309 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B
310 *
311 * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
312 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
313 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
314 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
315 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
316 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
317 * @param[in] src1_ptr Pointer to the source matrix. Supported formats: U8
318 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
319 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
320 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
321 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
322 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
323 * @param[out] dst_ptr Pointer to the destination matrix Supported formats: U8
324 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
325 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
326 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
327 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
328 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
329 * @param[in] a_offset Offset to be added to each element of the matrix A
330 * @param[in] b_offset Offset to be added to each element of the matrix B.
331 * @param[in] c_offset Offset to be added to each element of the matrix C.
332 * @param[in] c_mult_int Multiplied with each element of the matrix C.
333 * @param[in] shift Number of bits to shift right the result.
334 */
335__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
336 IMAGE_DECLARATION(src1),
337 IMAGE_DECLARATION(dst),
338 int a_offset,
339 int b_offset,
340 int c_offset,
341 int c_mult_int,
342 int shift)
343{
344 /* src_addr.s0 = address of matrix A */
345 /* src_addr.s1 = address of matrix B */
346
347 /* Compute address for matrix A and B */
348 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
349 (src1_stride_y));
350
351 /* Add offset_first_element_in_bytes */
352 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
353
354 /* Compute end row address for matrix B */
355 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
356
357 /* Reset accumulators */
358 int16 c00 = 0.0f;
359 int16 c10 = 0.0f;
360 int16 c20 = 0.0f;
361 int16 c30 = 0.0f;
362
363 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
364 {
365 /* Load values from matrix A (interleaved) and matrix B (transposed) */
366 int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
367 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
368
369 c00 += (int16)a0.s0 * b0;
370 c10 += (int16)a0.s1 * b0;
371 c20 += (int16)a0.s2 * b0;
372 c30 += (int16)a0.s3 * b0;
373
374 int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
375
376 c00 += (int16)a0.s4 * b1;
377 c10 += (int16)a0.s5 * b1;
378 c20 += (int16)a0.s6 * b1;
379 c30 += (int16)a0.s7 * b1;
380 }
381
382 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
383 {
384 /* Load values from matrix A (interleaved) and matrix B (transposed) */
385 int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
386 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
387
388 c00 += (int16)a0.s0 * b0;
389 c10 += (int16)a0.s1 * b0;
390 c20 += (int16)a0.s2 * b0;
391 c30 += (int16)a0.s3 * b0;
392 }
393
394 /* Compute destination address */
395 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
396
397 /* Multiply by the weight of matrix product */
398 c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
399 c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
400 c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
401 c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
402
403 /* Store 4x16 block */
404 vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
405 vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
406 vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
407 vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
408}
409#endif
410
411#if(defined WIDTH_MATRIX_B && defined ALPHA)
412/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
413 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
414 *
415 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
416 *
417 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
418 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
419 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
420 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
421 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
422 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
423 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
424 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
425 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
426 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
427 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
428 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
429 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
430 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
431 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
432 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
433 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
434 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
435 */
436__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
437 IMAGE_DECLARATION(src1),
438 IMAGE_DECLARATION(dst))
439{
440 /* src_addr.s0 = address of matrix A */
441 /* src_addr.s1 = address of matrix B */
442
443 /* Compute address for matrix A and B */
444 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
445 (src1_stride_y));
446
447 /* Add offset_first_element_in_bytes */
448 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
449
450 /* Divide by 4 in order to get the src_addr in unit of float */
451 src_addr = src_addr >> 2;
452
453 /* Compute end row address for matrix B */
454 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
455
456 /* Reset accumulators */
457 float4 c00 = 0.0f;
458 float4 c10 = 0.0f;
459 float4 c20 = 0.0f;
460 float4 c30 = 0.0f;
461
462 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
463 {
464 /* Load values from matrix A (interleaved) and matrix B (transposed) */
465 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
466 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
467
468 c00 += (float4)a0.s0 * b0;
469 c10 += (float4)a0.s1 * b0;
470 c20 += (float4)a0.s2 * b0;
471 c30 += (float4)a0.s3 * b0;
472
473 /* Load values from matrix A (interleaved) and matrix B (transposed) */
474 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
475 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
476
477 c00 += (float4)a0.s0 * b0;
478 c10 += (float4)a0.s1 * b0;
479 c20 += (float4)a0.s2 * b0;
480 c30 += (float4)a0.s3 * b0;
481 }
482
483 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
484 {
485 /* Load values from matrix A (interleaved) and matrix B (transposed) */
486 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
487 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
488
489 c00 += (float4)a0.s0 * b0;
490 c10 += (float4)a0.s1 * b0;
491 c20 += (float4)a0.s2 * b0;
492 c30 += (float4)a0.s3 * b0;
493 }
494
495 /* Compute destination address */
496 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
497
498 /* Multiply by the weight of matrix product */
499 c00 = c00 * (float4)ALPHA;
500 c10 = c10 * (float4)ALPHA;
501 c20 = c20 * (float4)ALPHA;
502 c30 = c30 * (float4)ALPHA;
503
504 /* Store 4x4 block */
505 vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
506 vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
507 vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
508 vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
509}
510
511/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
512 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
513 *
514 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
515 *
516 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
517 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
518 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
519 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
520 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
521 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
522 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
523 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
524 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
525 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
526 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
527 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
528 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
529 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
530 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
531 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
532 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
533 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
534 */
535__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
536 IMAGE_DECLARATION(src1),
537 IMAGE_DECLARATION(dst))
538{
539 // src_addr_a = address of matrix A
540 // src_addr_b = address of matrix B
541 __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
542 __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
543
544 // Compute end row address for matrix B
545 __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
546
547 // Reset accumulators
548 float c00 = 0.0f;
549 float c01 = 0.0f;
550 float c02 = 0.0f;
551 float c03 = 0.0f;
552 float c10 = 0.0f;
553 float c11 = 0.0f;
554 float c12 = 0.0f;
555 float c13 = 0.0f;
556 float c20 = 0.0f;
557 float c21 = 0.0f;
558 float c22 = 0.0f;
559 float c23 = 0.0f;
560 float c30 = 0.0f;
561 float c31 = 0.0f;
562 float c32 = 0.0f;
563 float c33 = 0.0f;
564
565 for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
566 {
567 // Load values from matrix A (interleaved) and matrix B (transposed)
568 float4 a0 = vload4(0, src_addr_a);
569 float4 b0 = vload4(0, src_addr_b);
570
571 c00 = fma(a0.s0, b0.s0, c00);
572 c01 = fma(a0.s0, b0.s1, c01);
573 c02 = fma(a0.s0, b0.s2, c02);
574 c03 = fma(a0.s0, b0.s3, c03);
575
576 c10 = fma(a0.s1, b0.s0, c10);
577 c11 = fma(a0.s1, b0.s1, c11);
578 c12 = fma(a0.s1, b0.s2, c12);
579 c13 = fma(a0.s1, b0.s3, c13);
580
581 c20 = fma(a0.s2, b0.s0, c20);
582 c21 = fma(a0.s2, b0.s1, c21);
583 c22 = fma(a0.s2, b0.s2, c22);
584 c23 = fma(a0.s2, b0.s3, c23);
585
586 c30 = fma(a0.s3, b0.s0, c30);
587 c31 = fma(a0.s3, b0.s1, c31);
588 c32 = fma(a0.s3, b0.s2, c32);
589 c33 = fma(a0.s3, b0.s3, c33);
590
591 // Load values from matrix A (interleaved) and matrix B (transposed)
592 a0 = vload4(0, src_addr_a + 4);
593 b0 = vload4(0, src_addr_b + 4);
594
595 c00 = fma(a0.s0, b0.s0, c00);
596 c01 = fma(a0.s0, b0.s1, c01);
597 c02 = fma(a0.s0, b0.s2, c02);
598 c03 = fma(a0.s0, b0.s3, c03);
599
600 c10 = fma(a0.s1, b0.s0, c10);
601 c11 = fma(a0.s1, b0.s1, c11);
602 c12 = fma(a0.s1, b0.s2, c12);
603 c13 = fma(a0.s1, b0.s3, c13);
604
605 c20 = fma(a0.s2, b0.s0, c20);
606 c21 = fma(a0.s2, b0.s1, c21);
607 c22 = fma(a0.s2, b0.s2, c22);
608 c23 = fma(a0.s2, b0.s3, c23);
609
610 c30 = fma(a0.s3, b0.s0, c30);
611 c31 = fma(a0.s3, b0.s1, c31);
612 c32 = fma(a0.s3, b0.s2, c32);
613 c33 = fma(a0.s3, b0.s3, c33);
614
615 // Load values from matrix A (interleaved) and matrix B (transposed)
616 a0 = vload4(0, src_addr_a + 8);
617 b0 = vload4(0, src_addr_b + 8);
618
619 c00 = fma(a0.s0, b0.s0, c00);
620 c01 = fma(a0.s0, b0.s1, c01);
621 c02 = fma(a0.s0, b0.s2, c02);
622 c03 = fma(a0.s0, b0.s3, c03);
623
624 c10 = fma(a0.s1, b0.s0, c10);
625 c11 = fma(a0.s1, b0.s1, c11);
626 c12 = fma(a0.s1, b0.s2, c12);
627 c13 = fma(a0.s1, b0.s3, c13);
628
629 c20 = fma(a0.s2, b0.s0, c20);
630 c21 = fma(a0.s2, b0.s1, c21);
631 c22 = fma(a0.s2, b0.s2, c22);
632 c23 = fma(a0.s2, b0.s3, c23);
633
634 c30 = fma(a0.s3, b0.s0, c30);
635 c31 = fma(a0.s3, b0.s1, c31);
636 c32 = fma(a0.s3, b0.s2, c32);
637 c33 = fma(a0.s3, b0.s3, c33);
638
639 // Load values from matrix A (interleaved) and matrix B (transposed)
640 a0 = vload4(0, src_addr_a + 12);
641 b0 = vload4(0, src_addr_b + 12);
642
643 c00 = fma(a0.s0, b0.s0, c00);
644 c01 = fma(a0.s0, b0.s1, c01);
645 c02 = fma(a0.s0, b0.s2, c02);
646 c03 = fma(a0.s0, b0.s3, c03);
647
648 c10 = fma(a0.s1, b0.s0, c10);
649 c11 = fma(a0.s1, b0.s1, c11);
650 c12 = fma(a0.s1, b0.s2, c12);
651 c13 = fma(a0.s1, b0.s3, c13);
652
653 c20 = fma(a0.s2, b0.s0, c20);
654 c21 = fma(a0.s2, b0.s1, c21);
655 c22 = fma(a0.s2, b0.s2, c22);
656 c23 = fma(a0.s2, b0.s3, c23);
657
658 c30 = fma(a0.s3, b0.s0, c30);
659 c31 = fma(a0.s3, b0.s1, c31);
660 c32 = fma(a0.s3, b0.s2, c32);
661 c33 = fma(a0.s3, b0.s3, c33);
662 }
663
664 for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
665 {
666 // Load values from matrix A (interleaved) and matrix B (transposed)
667 float4 a0 = vload4(0, src_addr_a);
668 float4 b0 = vload4(0, src_addr_b);
669
670 c00 = fma(a0.s0, b0.s0, c00);
671 c01 = fma(a0.s0, b0.s1, c01);
672 c02 = fma(a0.s0, b0.s2, c02);
673 c03 = fma(a0.s0, b0.s3, c03);
674
675 c10 = fma(a0.s1, b0.s0, c10);
676 c11 = fma(a0.s1, b0.s1, c11);
677 c12 = fma(a0.s1, b0.s2, c12);
678 c13 = fma(a0.s1, b0.s3, c13);
679
680 c20 = fma(a0.s2, b0.s0, c20);
681 c21 = fma(a0.s2, b0.s1, c21);
682 c22 = fma(a0.s2, b0.s2, c22);
683 c23 = fma(a0.s2, b0.s3, c23);
684
685 c30 = fma(a0.s3, b0.s0, c30);
686 c31 = fma(a0.s3, b0.s1, c31);
687 c32 = fma(a0.s3, b0.s2, c32);
688 c33 = fma(a0.s3, b0.s3, c33);
689 }
690
691 // Compute destination address
692 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
693
694 // Multiply by the weight of matrix product
695 c00 = c00 * ALPHA;
696 c01 = c01 * ALPHA;
697 c02 = c02 * ALPHA;
698 c03 = c03 * ALPHA;
699 c10 = c10 * ALPHA;
700 c11 = c11 * ALPHA;
701 c12 = c12 * ALPHA;
702 c13 = c13 * ALPHA;
703 c20 = c20 * ALPHA;
704 c21 = c21 * ALPHA;
705 c22 = c22 * ALPHA;
706 c23 = c23 * ALPHA;
707 c30 = c30 * ALPHA;
708 c31 = c31 * ALPHA;
709 c32 = c32 * ALPHA;
710 c33 = c33 * ALPHA;
711
712 barrier(CLK_GLOBAL_MEM_FENCE);
713
714 // Store 4x4 block
715 vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
716 vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
717 vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
718 vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
719}
720
721/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
722 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
723 *
724 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
725 *
726 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
727 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
728 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
729 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
730 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
731 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
732 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
733 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
734 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
735 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
736 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
737 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
738 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
739 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
740 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
741 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
742 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
743 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
744 */
745__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
746 IMAGE_DECLARATION(src1),
747 IMAGE_DECLARATION(dst))
748{
749 /* src_addr.s0 = address of matrix A */
750 /* src_addr.s1 = address of matrix B */
751
752 /* Compute address for matrix A and B */
753 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
754 (src1_stride_y));
755
756 /* Add offset_first_element_in_bytes */
757 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
758
759 /* Divide by 2 in order to get the src_addr in unit of half */
760 src_addr = src_addr >> 1;
761
762 /* Compute end row address for matrix B */
763 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
764
765 /* Reset accumulators */
766 half8 c00 = 0.0f;
767 half8 c10 = 0.0f;
768 half8 c20 = 0.0f;
769 half8 c30 = 0.0f;
770
771 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
772 {
773 /* Load values from matrix A (interleaved) and matrix B (transposed) */
774 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
775 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
776
777 c00 += (half8)a0.s0 * b0;
778 c10 += (half8)a0.s1 * b0;
779 c20 += (half8)a0.s2 * b0;
780 c30 += (half8)a0.s3 * b0;
781
782 /* Load values from matrix A (interleaved) and matrix B (transposed) */
783 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
784 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
785
786 c00 += (half8)a0.s0 * b0;
787 c10 += (half8)a0.s1 * b0;
788 c20 += (half8)a0.s2 * b0;
789 c30 += (half8)a0.s3 * b0;
790 }
791
792 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
793 {
794 /* Load values from matrix A (interleaved) and matrix B (transposed) */
795 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
796 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
797
798 c00 += (half8)a0.s0 * b0;
799 c10 += (half8)a0.s1 * b0;
800 c20 += (half8)a0.s2 * b0;
801 c30 += (half8)a0.s3 * b0;
802 }
803
804 /* Compute destination address */
805 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
806
807 /* Multiply by the weight of matrix product */
808 c00 = c00 * (half8)ALPHA;
809 c10 = c10 * (half8)ALPHA;
810 c20 = c20 * (half8)ALPHA;
811 c30 = c30 * (half8)ALPHA;
812
813 /* Store 4x8 block */
814 vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
815 vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
816 vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
817 vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
818}
819
820#if(defined WIDTH_VECTOR_A)
821/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
822 *
823 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
824 *
825 * @attention The input vector A and matrix B must not be reshaped
826 *
827 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
828 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
829 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
830 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
831 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
832 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
833 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
834 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
835 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
836 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
837 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
838 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
839 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
840 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
841 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
842 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
843 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
844 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
845 */
846__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
847 IMAGE_DECLARATION(src1),
848 IMAGE_DECLARATION(dst))
849{
850 int idx = get_global_id(0) * 4;
851
852 /* Compute the address for the vector A and matrix B */
853 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
854 src_addr.s1 += idx * sizeof(float);
855
856 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
857
858 float4 acc = 0.0f;
859
860 for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
861 {
862 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
863 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
864 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
865
866 acc += b0 * (float4)a0.s0;
867 acc += b1 * (float4)a0.s1;
868 }
869
870 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
871 {
872 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
873 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
874
875 acc += b0 * (float4)a0;
876 }
877
878 /* Compute destination address */
879 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
880
881 /* Multiply by the weight of vector-matrix product */
882 acc = acc * (float4)ALPHA;
883
884 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
885}
886
887/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
888 *
889 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
890 *
891 * @attention The input vector A and matrix B must not be reshaped
892 *
893 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
894 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
895 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
896 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
897 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
898 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
899 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
900 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
901 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
902 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
903 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
904 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
905 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
906 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
907 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
908 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
909 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
910 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
911 */
912__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
913 IMAGE_DECLARATION(src1),
914 IMAGE_DECLARATION(dst))
915{
916 int idx = get_global_id(0) * 8;
917
918 /* Compute the address for the vector A and matrix B */
919 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
920 src_addr.s1 += idx * sizeof(half);
921
922 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
923
924 half8 acc = 0.0f;
925
926 for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
927 {
928 half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
929 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
930 half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
931 half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
932 half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
933
934 acc += b0 * (half8)a0.s0;
935 acc += b1 * (half8)a0.s1;
936 acc += b2 * (half8)a0.s2;
937 acc += b3 * (half8)a0.s3;
938 }
939
940 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
941 {
942 half a0 = *((__global half *)(src0_ptr + src_addr.s0));
943 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
944
945 acc += b0 * (half8)a0;
946 }
947
948 /* Compute destination address */
949 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
950
951 /* Multiply by the weight of vector-matrix product */
952 acc = acc * (half8)ALPHA;
953
954 vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
955}
956#endif /* (defined WIDTH_VECTOR_A) */
957#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
958
959#if(defined BETA)
960/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
961 *
962 * @attention The beta's value need to be passed at compile time using -DBETA
963 *
964 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
965 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
966 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
967 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
968 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
969 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
970 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
971 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
972 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
973 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
974 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
975 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
976 */
977__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
978 IMAGE_DECLARATION(dst))
979{
980 /* Compute source and destination addresses */
981 Image src = CONVERT_TO_IMAGE_STRUCT(src);
982 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
983
984 /* Load values from A x B */
985 float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
986
987 /* Load values from Matrix C */
988 float4 c = vload4(0, (__global float *)src.ptr);
989
990 /* Computes alpha * axb + beta * c */
991 float4 out = alpha_ab + (float4)BETA * c;
992
993 /* Store final result in axb matrix */
994 vstore4(out, 0, (__global float *)dst.ptr);
995}
996
997/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
998 *
999 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
1000 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1001 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1002 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1003 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1004 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
1005 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
1006 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1007 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1008 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1009 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1010 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1011 */
1012__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
1013 IMAGE_DECLARATION(dst))
1014{
1015 /* Compute source and destination addresses */
1016 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1017 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1018
1019 /* Load values from A x B */
1020 half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
1021
1022 /* Load values from Matrix C */
1023 half8 c = vload8(0, (__global half *)src.ptr);
1024
1025 /* Computes alpha * axb + beta * c */
1026 half8 out = alpha_ab + (half8)BETA * c;
1027
1028 /* Store final result in axb matrix */
1029 vstore8(out, 0, (__global half *)dst.ptr);
1030}
1031#endif /* (defined BETA) */
1032
1033#if(defined WIDTH_VECTOR_A)
1034/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
1035 *
1036 * @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
1037 *
1038 * @attention The input A and matrix B must not be reshaped
1039 *
1040 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
1041 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1042 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1043 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1044 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1045 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1046 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
1047 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1048 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1049 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1050 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1051 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
1052 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1053 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1054 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
1055 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1056 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1057 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1058 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1059 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1060 */
1061__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
1062 TENSOR3D_DECLARATION(src1),
1063 IMAGE_DECLARATION(dst))
1064{
1065 int idx = get_global_id(0) * 4;
1066 int idy = get_global_id(1);
1067
1068 /* Compute the address for the vector A and matrix B */
1069 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
1070 src_addr.s1 += idx * sizeof(float);
1071
1072 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
1073
1074 float4 acc = 0.0f;
1075
1076 for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
1077 {
1078 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
1079 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1080 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
1081
1082 acc += b0 * (float4)a0.s0;
1083 acc += b1 * (float4)a0.s1;
1084 }
1085
1086 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
1087 {
1088 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
1089 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1090
1091 acc += b0 * (float4)a0;
1092 }
1093
1094 /* Compute destination address */
1095 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1096
1097 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
1098}
1099#endif /* (defined WIDTH_VECTOR_A) */