/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
/* The work-group size is supplied at compile time through LOCAL_SIZE_X, LOCAL_SIZE_Y and LOCAL_SIZE_Z. */
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
#include "helpers.h"

/* Kernels in this branch declare their buffers with float elements. */
#if defined(DATA_TYPE_FP32)
/* Element-wise vector load/store helpers built on LOAD4 / STORE4.
 *
 * @p offset is parenthesised in every expansion so that an argument containing an
 * operator of lower precedence than '+' (for instance a shift) still addresses
 * offset, offset + 1, ... as intended.
 *
 * NOTE: each macro expands to several statements and evaluates @p offset more than
 * once. Use them only inside a braced block and pass side-effect free arguments.
 */

/* Load two consecutive elements of buffer @p name, starting at @p offset, into r.x and r.y. */
#define LOAD8(r, name, offset)   \
    r.x = LOAD4(name, (offset)); \
    r.y = LOAD4(name, (offset) + uint(1))

/* Load four consecutive elements of buffer @p name, starting at @p offset, into r.x ... r.w. */
#define LOAD16(r, name, offset)            \
    r.x = LOAD4(name, (offset));           \
    r.y = LOAD4(name, (offset) + uint(1)); \
    r.z = LOAD4(name, (offset) + uint(2)); \
    r.w = LOAD4(name, (offset) + uint(3))

/* Store r.x ... r.w to four consecutive elements of buffer @p name, starting at @p offset. */
#define STORE16(name, offset, r)            \
    STORE4(name, (offset), r.x);            \
    STORE4(name, (offset) + uint(1), r.y);  \
    STORE4(name, (offset) + uint(2), r.z);  \
    STORE4(name, (offset) + uint(3), r.w)
43
#ifdef GEMM_TRANSPOSE1xW
BUFFER_DECLARATION(src, 1, float, readonly);
BUFFER_DECLARATION(dst, 2, float, writeonly);

layout(std140) uniform shader_params
{
    IMAGE_PARAM_DECLARATION(src);
    IMAGE_PARAM_DECLARATION(dst);
};

/** OpenGL ES kernel performing the "vector" 1x4 transposition of the input matrix.
 *
 * Every invocation reads four consecutive values from the source at offset(src, 0, 0)
 * and writes them to the destination at the byte position
 * gl_GlobalInvocationID.y * 16 + gl_GlobalInvocationID.x * dst.stride_y + dst.offset_first_element_in_bytes,
 * i.e. the X and Y invocation coordinates swap roles on the destination side.
 *
 * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
void main(void)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Byte position inside the transposed matrix; 16 bytes is the size of one 4-float vector. */
    uint vector_bytes = gl_GlobalInvocationID.y * uint(16);
    uint row_bytes    = gl_GlobalInvocationID.x * dst.stride_y;

    /* The shift by 2 turns the byte position into an index of 4-byte elements. */
    uint dst_index = (vector_bytes + row_bytes + dst.offset_first_element_in_bytes) >> 2;

    vec4 values;
    LOAD16(values, src, offset(src, 0, 0));
    STORE16(dst, dst_index, values);
}
#endif /* GEMM_TRANSPOSE1xW */
82
#ifdef GEMM_INTERLEAVE4x4
BUFFER_DECLARATION(src, 1, float, readonly);
BUFFER_DECLARATION(dst, 2, float, writeonly);

layout(std140) uniform shader_params
{
    IMAGE_PARAM_DECLARATION(src);
    IMAGE_PARAM_DECLARATION(dst);
};

/** OpenGL ES kernel reshaping the input matrix by interleaving its values.
 *
 * Every invocation visits the 16 source positions offset(src, a, b) with a and b in [0, 3]
 * and writes the values contiguously to the destination, starting at CURRENT_OFFSET(dst),
 * at relative position a * 4 + b (the second coordinate varies fastest).
 *
 * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
void main(void)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    for(int a = 0; a < 4; ++a)
    {
        for(int b = 0; b < 4; ++b)
        {
            /* Position of this value inside the 16-element output group. */
            uint  dst_index = CURRENT_OFFSET(dst) + uint(a * 4 + b);
            float value     = LOAD4(src, offset(src, a, b));
            STORE4(dst, dst_index, value);
        }
    }
}
#endif /* GEMM_INTERLEAVE4x4 */
128
#ifdef GEMM_ACCUMULATE_BIASES
BUFFER_DECLARATION(accum, 1, float, restrict);
BUFFER_DECLARATION(biases, 2, float, readonly);

layout(std140) uniform shader_params
{
    IMAGE_PARAM_DECLARATION(accum);
    VECTOR_PARAM_DECLARATION(biases);
};

/** Kernel adding the biases vector to a row of the accumulate tensor, in place.
 *
 * Every invocation processes 16 consecutive elements: element k of the accumulate tensor
 * (relative to CURRENT_OFFSET(accum)) is replaced by itself plus element k of the biases
 * vector (relative to CURRENT_OFFSET(biases)).
 *
 * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
 * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
 * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
 * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
 * @param[in]      biases_ptr                           Pointer to the biases vector. Supported data type: same as @p accum_ptr
 * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
 * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
 */
void main(void)
{
    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

    for(uint k = uint(0); k < uint(16); ++k)
    {
        uint accum_index = CURRENT_OFFSET(accum) + k;
        uint bias_index  = CURRENT_OFFSET(biases) + k;

        float accum_value = LOAD4(accum, accum_index);
        float bias_value  = LOAD4(biases, bias_index);
        float sum         = bias_value + accum_value;

        /* Write the biased value back over the element that was just read. */
        STORE4(accum, accum_index, sum);
    }
}
#endif /* GEMM_ACCUMULATE_BIASES */
168
#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED /* unvalidate */
BUFFER_DECLARATION(src0, 1, float, readonly);
BUFFER_DECLARATION(src1, 2, float, readonly);
BUFFER_DECLARATION(dst, 3, float, writeonly);

layout(std140) uniform shader_params
{
    IMAGE_PARAM_DECLARATION(src0);
    IMAGE_PARAM_DECLARATION(src1);
    IMAGE_PARAM_DECLARATION(dst);
};

/** OpenGL ES kernel optimised for Midgard computing the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4
 * before running the matrix multiplication.
 *
 * Every invocation accumulates a 4x4 block: for each group of four values of A and four values of B it adds
 * a.x * b, a.y * b, a.z * b and a.w * b to the four output rows, scales the rows by ALPHA and stores them.
 *
 * @attention The number of elements to walk along matrix B and the alpha's value are compile-time
 *            constants read from COLS_B and ALPHA
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
void main()
{
    Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
    Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
    Image dst  = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Start positions (as element indices) of this invocation's row in A and in B. */
    src0.current_offset = (src0.offset_first_element_in_bytes + (gl_GlobalInvocationID.y * src0.stride_y)) >> uint(2);
    src1.current_offset = (src1.offset_first_element_in_bytes + (gl_GlobalInvocationID.x * src1.stride_y)) >> uint(2);

    /* One past the last element of B to be consumed. */
    int end_row_mtx_b = int(src1.current_offset) + int(COLS_B);

    /* One accumulator per output row of the 4x4 block. */
    vec4 row0 = vec4(0.0f);
    vec4 row1 = vec4(0.0f);
    vec4 row2 = vec4(0.0f);
    vec4 row3 = vec4(0.0f);

    /* Main loop, manually unrolled by two: consumes eight elements of A and of B per pass. */
    while(int(src1.current_offset) <= (end_row_mtx_b - 8))
    {
        vec4 a;
        vec4 b;

        LOAD16(a, src0, src0.current_offset);
        LOAD16(b, src1, src1.current_offset);

        row0 += a.x * b;
        row1 += a.y * b;
        row2 += a.z * b;
        row3 += a.w * b;

        LOAD16(a, src0, src0.current_offset + uint(4));
        LOAD16(b, src1, src1.current_offset + uint(4));

        row0 += a.x * b;
        row1 += a.y * b;
        row2 += a.z * b;
        row3 += a.w * b;

        src0.current_offset += uint(8);
        src1.current_offset += uint(8);
    }

    /* Left-over loop: four elements of A and of B per pass. */
    while(int(src1.current_offset) < end_row_mtx_b)
    {
        vec4 a;
        vec4 b;

        LOAD16(a, src0, src0.current_offset);
        LOAD16(b, src1, src1.current_offset);

        row0 += a.x * b;
        row1 += a.y * b;
        row2 += a.z * b;
        row3 += a.w * b;

        src0.current_offset += uint(4);
        src1.current_offset += uint(4);
    }

    /* Apply the weight of the matrix product. */
    row0 *= vec4(ALPHA);
    row1 *= vec4(ALPHA);
    row2 *= vec4(ALPHA);
    row3 *= vec4(ALPHA);

    /* Store the 4x4 block, one row per store. */
    STORE16(dst, offset(dst, 0, 0), row0);
    STORE16(dst, offset(dst, 0, 1), row1);
    STORE16(dst, offset(dst, 0, 2), row2);
    STORE16(dst, offset(dst, 0, 3), row3);
}
#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
275
#ifdef GEMM_MM_FLOATING_POINT
BUFFER_DECLARATION(src0, 1, float, readonly);
BUFFER_DECLARATION(src1, 2, float, readonly);
BUFFER_DECLARATION(dst, 3, float, writeonly);

layout(std140) uniform shader_params
{
    IMAGE_PARAM_DECLARATION(src0);
    IMAGE_PARAM_DECLARATION(src1);
    IMAGE_PARAM_DECLARATION(dst);
};

/** OpenGL ES kernel computing the matrix multiplication between matrix A (src0) and matrix B (src1).
 *
 * Every invocation produces four consecutive output values for up to four output rows
 * (NUM_ELEMS_PROCESSED_PER_THREAD_Y of them). It walks COLS_A elements along a row of A while stepping
 * down the rows of B by src1_stride_y, two rows of B per pass in the main loop and one row per pass
 * in the left-over loop. The accumulated sums are scaled by ALPHA before being stored.
 *
 * @attention COLS_A, ALPHA, NUM_ELEMS_PROCESSED_PER_THREAD_X and NUM_ELEMS_PROCESSED_PER_THREAD_Y
 *            are compile-time constants read by this kernel
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
void main()
{
    Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
    Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
    Image dst  = CONVERT_TO_IMAGE_STRUCT(dst);

    /* First column of B handled by this invocation. */
    int first_col = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);

    /* Start positions in A and B, converted from bytes to element indices by the shift. */
    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)) >> uint(2);
    src1.current_offset = (src1_offset_first_element_in_bytes + uint(first_col * 4)) >> uint(2);

    /* One past the last element of the row of A to be consumed. */
    int end_row_vec_a = int(src0.current_offset) + ((COLS_A * 4) >> 2);

    /* Loop-invariant steps through B, expressed in elements. */
    uint rhs_next_row      = src1_stride_y >> uint(2);
    uint rhs_step_two_rows = uint((2 * int(src1_stride_y)) >> 2);
    uint rhs_step_one_row  = uint(int(src1_stride_y) >> 2);

    /* One accumulator per output row. */
    vec4 sum0 = vec4(0.0f);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vec4 sum1 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vec4 sum2 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vec4 sum3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    /* Main loop: two elements of each row of A against two rows of B per pass. */
    while(int(src0.current_offset) <= (end_row_vec_a - 2))
    {
        vec2 lhs0;
        LOAD8(lhs0, src0, src0.current_offset);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        vec2 lhs1;
        LOAD8(lhs1, src0, src0.current_offset + (src0_stride_y >> uint(2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        vec2 lhs2;
        LOAD8(lhs2, src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec2 lhs3;
        LOAD8(lhs3, src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        vec4 rhs_row0;
        vec4 rhs_row1;
        LOAD16(rhs_row0, src1, src1.current_offset);
        LOAD16(rhs_row1, src1, src1.current_offset + rhs_next_row);

        sum0 += rhs_row0 * lhs0.x;
        sum0 += rhs_row1 * lhs0.y;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        sum1 += rhs_row0 * lhs1.x;
        sum1 += rhs_row1 * lhs1.y;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        sum2 += rhs_row0 * lhs2.x;
        sum2 += rhs_row1 * lhs2.y;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        sum3 += rhs_row0 * lhs3.x;
        sum3 += rhs_row1 * lhs3.y;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        src0.current_offset += uint(2);
        src1.current_offset += rhs_step_two_rows;
    }

    /* Left-over loop: one element of each row of A against one row of B per pass. */
    while(int(src0.current_offset) < end_row_vec_a)
    {
        float lhs0 = LOAD4(src0, src0.current_offset);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        float lhs1 = LOAD4(src0, src0.current_offset + (src0_stride_y >> uint(2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        float lhs2 = LOAD4(src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        float lhs3 = LOAD4(src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        vec4 rhs_row;
        LOAD16(rhs_row, src1, src1.current_offset);

        sum0 += rhs_row * lhs0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        sum1 += rhs_row * lhs1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        sum2 += rhs_row * lhs2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        sum3 += rhs_row * lhs3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        src0.current_offset += uint(1);
        src1.current_offset += rhs_step_one_row;
    }

    /* Scale each row by the weight of the product and store it. */
    sum0 *= vec4(ALPHA);
    STORE16(dst, offset(dst, 0, 0), sum0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    sum1 *= vec4(ALPHA);
    STORE16(dst, offset(dst, 0, 1), sum1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    sum2 *= vec4(ALPHA);
    STORE16(dst, offset(dst, 0, 2), sum2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    sum3 *= vec4(ALPHA);
    STORE16(dst, offset(dst, 0, 3), sum3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#endif /* GEMM_MM_FLOATING_POINT */
426
#ifdef GEMM_MATRIXADDITION
BUFFER_DECLARATION(src, 1, float, readonly);
BUFFER_DECLARATION(dst, 2, float, restrict);

layout(std140) uniform shader_params
{
    IMAGE_PARAM_DECLARATION(src);
    IMAGE_PARAM_DECLARATION(dst);
};

/** OpenGL ES kernel performing the in-place matrix addition dst = dst + BETA * src.
 *
 * The destination already holds the (alpha-weighted) product A x B; every invocation reads four
 * of its values, adds the matching four values of the source matrix weighted by BETA and writes
 * the result back to the same destination position.
 *
 * @attention The beta's value needs to be passed at compile time using BETA
 *
 * @param[in]      src_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]      src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]      src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]      src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]      src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]      src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in, out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]      dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]      dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]      dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]      dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]      dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
void main(void)
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Current content of the destination (alpha * A x B) and the matrix to be added. */
    vec4 alpha_ab;
    vec4 c;
    LOAD16(alpha_ab, dst, dst.current_offset);
    LOAD16(c, src, src.current_offset);

    /* alpha * A x B + beta * C */
    vec4 result = alpha_ab + vec4(BETA * c);

    /* Overwrite the destination values that were just read. */
    STORE16(dst, dst.current_offset, result);
}
#endif /* GEMM_MATRIXADDITION */
/* Kernels in this branch declare their buffers as uint / uvec2 and unpack half-precision values from them. */
#elif defined(DATA_TYPE_FP16)
/* Default float precision for every FP16 kernel below. */
precision mediump float;
#ifdef GEMM_MM_FLOATING_POINT
BUFFER_DECLARATION(src0, 1, uint, readonly);
BUFFER_DECLARATION(src1, 2, uvec2, readonly);
BUFFER_DECLARATION(dst, 3, uvec2, writeonly);

layout(std140) uniform shader_params
{
    IMAGE_PARAM_DECLARATION(src0);
    IMAGE_PARAM_DECLARATION(src1);
    IMAGE_PARAM_DECLARATION(dst);
};

/** OpenGL ES kernel computing the matrix multiplication between matrix A (src0) and matrix B (src1) for F16 data.
 *
 * Half-precision values are stored packed: src0 is declared with uint elements (two halves each),
 * src1 and dst with uvec2 elements (four halves each). Values are unpacked with unpackHalf2x16,
 * accumulated in a vec4 and packed again with packHalf2x16 before the store.
 *
 * Every invocation walks COLS_A half values (COLS_A * 2 bytes) along a row of A while stepping down
 * the rows of B by src1_stride_y, producing one group of four output values scaled by ALPHA.
 * Unlike the FP32 kernels above, the offsets handled here are kept in bytes.
 *
 * @attention COLS_A, ALPHA, NUM_ELEMS_PROCESSED_PER_THREAD_X and NUM_ELEMS_PROCESSED_PER_THREAD_Y
 *            are compile-time constants read by this kernel
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
void main()
{
    Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
    Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
    Image dst  = GC_CONVERT_TO_IMAGE_STRUCT(dst);

    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
    /* Compute the (byte) address for the vector A and matrix B */
    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
    src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;

    /* Compute end row address for matrix A: COLS_A halves of two bytes each */
    uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);

    /* Reset accumulators */
    vec4 acc0 = vec4(0.0f);

    /* Main loop: one packed word of A (two halves) against two rows of B per pass.
     * The guard is written as "offset + 2 < end" rather than "offset < end - 2": both operands are
     * unsigned, so the subtraction would wrap to a huge value whenever end_row_vec_a is below 2
     * and the loop would then run far past the end of the row. */
    for(; (src0.current_offset + uint(2)) < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y)
    {
        uint packed_a0;
        vec2 a0;

        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
        a0 = vec2(unpackHalf2x16(packed_a0));

        uvec2 packed_b0;
        uvec2 packed_b1;
        vec4  b0;
        vec4  b1;

        /* NOTE(review): the last macro argument presumably selects the row relative to the current one - confirm in helpers.h */
        GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
        GC_LOAD1_2D_OFFSET(packed_b1, src1, 0, 1);

        b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
        b1 = vec4(unpackHalf2x16(packed_b1.x), unpackHalf2x16(packed_b1.y));

        acc0 += b0 * vec4(a0.x);
        acc0 += b1 * vec4(a0.y);
    }

    /* Left-over loop: handles the single half value that remains when the main loop could not
     * consume a full pair. Only a0.x is used; the offset still advances by one packed word,
     * which moves it past end_row_vec_a and ends the loop. */
    for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += src1_stride_y)
    {
        uint packed_a0;
        vec2 a0;

        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
        a0 = vec2(unpackHalf2x16(packed_a0));

        uvec2 packed_b0;
        vec4  b0;

        GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);

        b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));

        acc0 += b0 * (a0.x);
    }

    /* Multiply by the weight of vector-matrix product */
    acc0 = acc0 * vec4(ALPHA);

    /* Pack the four results back into two 32-bit words and store them */
    uvec2 packed_d;
    packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));
    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
}
#endif /* GEMM_MM_FLOATING_POINT */
579
#ifdef GEMM_ACCUMULATE_BIASES
BUFFER_DECLARATION(accum, 1, uvec2, restrict);
BUFFER_DECLARATION(biases, 2, uvec2, readonly);

layout(std140) uniform shader_params
{
    IMAGE_PARAM_DECLARATION(accum);
    VECTOR_PARAM_DECLARATION(biases);
};

/** Kernel adding the biases vector to a row of the accumulate tensor, in place (F16 data).
 *
 * Every invocation loads one uvec2 (four packed half values) from the accumulate tensor and one
 * from the biases vector, unpacks both, adds them and stores the re-packed sum back to the
 * accumulate tensor.
 *
 * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
 * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
 * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
 * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
 * @param[in]      biases_ptr                           Pointer to the biases vector. Supported data type: same as @p accum_ptr
 * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
 * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
 */
void main(void)
{
    Image  accum  = GC_CONVERT_TO_IMAGE_STRUCT(accum);
    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);

    /* Fetch the packed operands. */
    uvec2 packed_accum;
    uvec2 packed_bias;
    GC_LOAD1_2D_OFFSET(packed_accum, accum, 0, 0);
    GC_LOAD1_1D_OFFSET(packed_bias, biases, 0);

    /* Expand each pair of 32-bit words into four float values. */
    vec4 accum_values = vec4(unpackHalf2x16(packed_accum.x), unpackHalf2x16(packed_accum.y));
    vec4 bias_values  = vec4(unpackHalf2x16(packed_bias.x), unpackHalf2x16(packed_bias.y));

    vec4 sum = accum_values + bias_values;

    /* Re-pack and overwrite the accumulate values that were just read. */
    uvec2 packed_sum = uvec2(packHalf2x16(sum.xy), packHalf2x16(sum.zw));
    GC_STORE1_2D_OFFSET(packed_sum, accum, 0, 0);
}
#endif /* GEMM_ACCUMULATE_BIASES */
#else /* neither DATA_TYPE_FP32 nor DATA_TYPE_FP16 is defined */
#error Data type not supported
#endif /* DATA_TYPE_FP32 / DATA_TYPE_FP16 */