/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
#include "helpers_cs.h"

#if defined(DATA_TYPE_FP16)
precision mediump float;
#endif // DATA_TYPE_FP16

#if defined(DATA_TYPE_FP32)
#ifdef GEMM_TRANSPOSE1xW
/** This OpenGL ES kernel computes the "vector" 1x4 transposition of the input matrix
 *
 * @param[in]  src_ptr   Pointer to the source matrix. Supported data types: F32
 * @param[in]  src_attrs The attributes of the source matrix
 * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs The attributes of the destination matrix
 */
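/* Layout sketch (illustration only, assuming the host sets the usual 4-element
 * step along x): invocation (x, y) copies one 1x4 strip of source row y into
 * destination row x at element offset 4 * y, i.e. roughly
 *
 *     dst[x][4 * y + k] = src[y][4 * x + k],   k = 0..3
 *
 * so the 1x4 strips of one source column become contiguous in a destination
 * row, which is the layout the interleaved GEMM kernel below consumes.
 */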
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

void main(void)
{
    /* Compute address for Matrix B - source */
    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    /* Compute address for Matrix B transposed - destination. X and Y are swapped */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst_attrs.stride_y);

    vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, b0);
}
#endif /* GEMM_TRANSPOSE1xW */

#ifdef GEMM_INTERLEAVE4x4
/** This OpenGL ES kernel reshapes the input matrix by interleaving its values
 *
 * @param[in]  src_ptr   Pointer to the source matrix. Supported data types: F32
 * @param[in]  src_attrs The attributes of the source matrix
 * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs The attributes of the destination matrix
 */
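/* Layout sketch (illustration only): for one 4x4 block with elements a(row, col),
 * the loop below writes the block column by column into a contiguous strip:
 *
 *     dst: a(0,0) a(1,0) a(2,0) a(3,0) a(0,1) a(1,1) ... a(3,3)
 *
 * since IMAGE_OFFSET(src_iter, i, j) addresses column i / row j and the value
 * lands at linear offset i * 4 + j.
 */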
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

void main(void)
{
    /* Compute source and destination addresses */
    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    int i;
    int j;

    for(i = 0; i < 4; ++i)
    {
        for(j = 0; j < 4; ++j)
        {
            float res = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, j));
            STORE(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, (i * 4 + j)), res);
        }
    }
}
#endif /* GEMM_INTERLEAVE4x4 */

#ifdef GEMM_ACCUMULATE_BIASES
/** This kernel accumulates each row with the biases vector
 *
 * @param[in, out] accum_ptr    Pointer to the accumulate tensor. Supported data type: F32
 * @param[in]      accum_attrs  The attributes of the accumulate tensor
 * @param[in]      biases_ptr   Pointer to the biases vector. Same as @p accum_ptr
 * @param[in]      biases_attrs The attributes of the biases tensor
 */
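/* Semantics sketch (illustration only): for the 16 elements handled per
 * invocation the loop below computes
 *
 *     accum[y][x + i] += biases[x + i],   i = 0..15
 *
 * i.e. the same bias vector is added elementwise to every row of the
 * accumulator.
 */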
SHADER_PARAMS_DECLARATION
{
    ImageAttributes  accum_attrs;
    VectorAttributes biases_attrs;
};
TENSOR_DECLARATION(1, accumBuffer, float, accum_ptr, accum_shift, 2, restrict);
TENSOR_DECLARATION(2, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);

void main(void)
{
    ImageIterator  accum_iter  = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);

    for(int i = 0; i < 16; ++i)
    {
        float accum_value  = LOAD(accum_ptr, TENSOR_OFFSET_ADVANCE(accum_iter, i));
        float biases_value = LOAD(biases_ptr, TENSOR_OFFSET_ADVANCE(biases_iter, i));
        accum_value        = biases_value + accum_value;

        // Store the result in the accumulate buffer
        STORE(accum_ptr, TENSOR_OFFSET_ADVANCE(accum_iter, i), accum_value);
    }
}
#endif /* GEMM_ACCUMULATE_BIASES */

#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED /* not validated */
/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication.
 *
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
 *
 * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_attrs The attributes of the source matrix
 * @param[in]  src1_ptr   Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_attrs The attributes of the source matrix
 * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_attrs  The attributes of the destination matrix
 */
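/* Computation sketch (illustration only): each invocation owns a 4x4 block of
 * the destination. Per step it loads a vec4 a0 from the interleaved A (one
 * value per output row) and a vec4 b0 from the transposed B (one value per
 * output column) and applies the rank-1 update
 *
 *     c(r) += vec4(a0[r]) * b0,   r = 0..3
 *
 * accumulated along the K dimension and finally scaled by ALPHA.
 */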
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src0_attrs;
    ImageAttributes src1_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

void main()
{
    ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    /* Compute address for matrix A and B */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(gl_GlobalInvocationID.x) * (src1_attrs.stride_y));
    /* Compute end row address for matrix B */
    int end_row_mtx_b = int(TENSOR_OFFSET_ADVANCE(src1_iter, COLS_B));

    /* Reset accumulators */
    vec4 c00 = vec4(0.0f);
    vec4 c10 = vec4(0.0f);
    vec4 c20 = vec4(0.0f);
    vec4 c30 = vec4(0.0f);

    // FIXME: is loop unrolling really needed for GLES?
    for(; int(CURRENT_ITEM_OFFSET(src1_iter)) <= (end_row_mtx_b - 8); TENSOR_ITERATOR_ADVANCE(src0_iter, 8), TENSOR_ITERATOR_ADVANCE(src1_iter, 8))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
        vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);

        c00 += vec4(a0.x) * b0;
        c10 += vec4(a0.y) * b0;
        c20 += vec4(a0.z) * b0;
        c30 += vec4(a0.w) * b0;

        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        a0 = VLOAD4(vec4, src0_ptr, TENSOR_OFFSET_ADVANCE(src0_iter, 4));
        b0 = VLOAD4(vec4, src1_ptr, TENSOR_OFFSET_ADVANCE(src1_iter, 4));

        c00 += vec4(a0.x) * b0;
        c10 += vec4(a0.y) * b0;
        c20 += vec4(a0.z) * b0;
        c30 += vec4(a0.w) * b0;
    }

    for(; int(CURRENT_ITEM_OFFSET(src1_iter)) < end_row_mtx_b; TENSOR_ITERATOR_ADVANCE(src0_iter, 4), TENSOR_ITERATOR_ADVANCE(src1_iter, 4))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
        vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);

        c00 += vec4(a0.x) * b0;
        c10 += vec4(a0.y) * b0;
        c20 += vec4(a0.z) * b0;
        c30 += vec4(a0.w) * b0;
    }

    /* Multiply by the weight of the matrix product */
    c00 = c00 * vec4(ALPHA);
    c10 = c10 * vec4(ALPHA);
    c20 = c20 * vec4(ALPHA);
    c30 = c30 * vec4(ALPHA);

    /* Store the 4x4 block */
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), c00);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), c10);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), c20);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), c30);
}
#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */

#ifdef GEMM_MM_FLOATING_POINT
/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 * in case both matrices have not been reshaped
 *
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
 *
 * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_attrs The attributes of the source matrix
 * @param[in]  src1_ptr   Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_attrs The attributes of the source matrix
 * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_attrs  The attributes of the destination matrix
 */
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src0_attrs;
    ImageAttributes src1_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

void main()
{
    ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
    /* Compute the address for the vector A and matrix B */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y) * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, idx * 4);

    /* Compute end row address for matrix A */
    int end_row_vec_a = int(TENSOR_OFFSET_ADVANCE_IN_BYTES(src0_iter, COLS_A * 4));

    /* Reset accumulators */
    vec4 acc0 = vec4(0.0f);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vec4 acc1 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vec4 acc2 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    for(; int(CURRENT_ITEM_OFFSET(src0_iter)) <= (end_row_vec_a - 2); TENSOR_ITERATOR_ADVANCE(src0_iter, 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
    {
        vec2 a0 = VLOAD2_CURRENT_ITEM(vec2, src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        vec2 a1 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        vec2 a2 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec2 a3 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
        vec4 b1 = VLOAD4(vec4, src1_ptr, IMAGE_OFFSET(src1_iter, 0, 1));

        acc0 += b0 * vec4(a0.x);
        acc0 += b1 * vec4(a0.y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * vec4(a1.x);
        acc1 += b1 * vec4(a1.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * vec4(a2.x);
        acc2 += b1 * vec4(a2.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * vec4(a3.x);
        acc3 += b1 * vec4(a3.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    for(; int(CURRENT_ITEM_OFFSET(src0_iter)) < end_row_vec_a; TENSOR_ITERATOR_ADVANCE(src0_iter, 1), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y))
    {
        // Load values from matrix A
        float a0 = LOAD_CURRENT_ITEM(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        float a1 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        float a2 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        float a3 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);

        acc0 += b0 * vec4(a0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * vec4(a1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * vec4(a2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * vec4(a3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    /* Multiply by the weight of the vector-matrix product */
    acc0 = acc0 * vec4(ALPHA);
    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    acc1 = acc1 * vec4(ALPHA);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    acc2 = acc2 * vec4(ALPHA);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    acc3 = acc3 * vec4(ALPHA);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#endif /* GEMM_MM_FLOATING_POINT */

#ifdef GEMM_MM_FLOATING_POINT_BIFROST
/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 * in case both matrices have not been reshaped
 *
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
 *
 * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_attrs The attributes of the source matrix
 * @param[in]  src1_ptr   Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_attrs The attributes of the source matrix
 * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_attrs  The attributes of the destination matrix
 */
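/* Unrolling sketch (illustration only): the Bifrost-tuned loop below loads one
 * vec4 of A per destination row, then streams four consecutive rows of B,
 * reusing a0.x, a0.y, a0.z and a0.w in turn:
 *
 *     acc += B_row(k)     * a0.x;
 *     acc += B_row(k + 1) * a0.y;
 *     acc += B_row(k + 2) * a0.z;
 *     acc += B_row(k + 3) * a0.w;
 *
 * which amortises each load from A over four multiply-accumulates.
 */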
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src0_attrs;
    ImageAttributes src1_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

void main()
{
    ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
    /* Compute the address for the vector A and matrix B */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y) * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, idx * 4);

    /* Reset accumulators */
    vec4 acc0 = vec4(0.0f);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vec4 acc1 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vec4 acc2 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // A and B src indices get incremented at the same time.
    int i = 0;
    for(; i <= (COLS_A - 4); i += 4)
    {
        // Load values from matrix A and matrix B
        vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        vec4 a1 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        vec4 a2 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec4 a3 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);

        // Multiply and accumulate
        acc0 += b0 * vec4(a0.x);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * vec4(a1.x);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * vec4(a2.x);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * vec4(a3.x);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Load values from matrix B
        b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);

        // Multiply and accumulate
        acc0 += b0 * vec4(a0.y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * vec4(a1.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * vec4(a2.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * vec4(a3.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Load values from matrix B
        b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);

        // Multiply and accumulate
        acc0 += b0 * vec4(a0.z);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * vec4(a1.z);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * vec4(a2.z);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * vec4(a3.z);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Load values from matrix B
        b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);

        // Multiply and accumulate
        acc0 += b0 * vec4(a0.w);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * vec4(a1.w);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * vec4(a2.w);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * vec4(a3.w);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        TENSOR_ITERATOR_ADVANCE(src0_iter, 4);
    }

    for(; i < COLS_A; ++i)
    {
        // Load values from matrix A
        float a0 = LOAD_CURRENT_ITEM(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        float a1 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        float a2 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        float a3 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);

        // Multiply and accumulate
        acc0 += b0 * vec4(a0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * vec4(a1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * vec4(a2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * vec4(a3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
        TENSOR_ITERATOR_ADVANCE(src0_iter, 1);
    }

    /* Multiply by the weight of the vector-matrix product */
    acc0 = acc0 * vec4(ALPHA);
    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    acc1 = acc1 * vec4(ALPHA);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    acc2 = acc2 * vec4(ALPHA);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    acc3 = acc3 * vec4(ALPHA);
    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#endif /* GEMM_MM_FLOATING_POINT_BIFROST */

#ifdef GEMM_MATRIXADDITION
/** This OpenGL ES kernel performs the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta:
 *
 * @attention The value of beta needs to be passed at compile time using BETA
 *
 * @param[in]  src_ptr   Pointer to the source matrix. Supported data types: F32
 * @param[in]  src_attrs The attributes of the source matrix
 * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs The attributes of the destination matrix
 */
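/* Semantics sketch (illustration only): on entry dst already holds
 * alpha * (A * B) and src holds the original matrix C, so the kernel below
 * finalises the classic GEMM update
 *
 *     dst = alpha * (A * B) + BETA * C
 *
 * four floats at a time, in place in dst.
 */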
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);

void main(void)
{
    /* Compute source and destination addresses */
    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    /* Load values from A x B */
    vec4 alpha_ab = VLOAD4_CURRENT_ITEM(vec4, dst_ptr, dst_iter);
    vec4 c        = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);

    /* Compute alpha * axb + beta * c */
    vec4 out1 = alpha_ab + vec4(float(BETA) * c);

    /* Store the final result in the axb matrix */
    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, out1);
}
#endif /* GEMM_MATRIXADDITION */

#elif defined(DATA_TYPE_FP16)

#ifdef GEMM_TRANSPOSE1xW
/** This OpenGL ES kernel computes the "vector" 1x8 transposition of the input matrix
 *
 * @param[in]  src_ptr   Pointer to the source matrix. Supported data types: F16
 * @param[in]  src_attrs The attributes of the source matrix
 * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs The attributes of the destination matrix
 */
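/* Packing sketch (illustration only): the buffers are declared as uvec4, so a
 * single element carries 8 packed FP16 values (128 bits). One load/store pair
 * per invocation therefore moves a whole 1x8 strip; the address arithmetic
 * mirrors the FP32 1x4 variant, 16 bytes per strip.
 */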
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);

void main(void)
{
    /* Compute address for Matrix B - source */
    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    /* Compute address for Matrix B transposed - destination. X and Y are swapped */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst_attrs.stride_y);

    STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
}
#endif /* GEMM_TRANSPOSE1xW */

#ifdef GEMM_INTERLEAVE4x4
/** This OpenGL ES kernel reshapes the input matrix by interleaving its values
 *
 * @param[in]  src_ptr   Pointer to the source matrix. Supported data types: F16
 * @param[in]  src_attrs The attributes of the source matrix
 * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs The attributes of the destination matrix
 */
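/* Packing sketch (illustration only): LOAD_UNPACK8_*_HALF expands one uvec4
 * into vec4 s[2], i.e. 8 FP16 values, and STORE_PACK8_HALF is the inverse.
 * The shuffles below gather element n of four consecutive source rows into
 * one output vec4, reproducing the column-by-column interleaving of the FP32
 * kernel at 8 values per store.
 */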
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);

void main(void)
{
    /* Compute source and destination addresses */
    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    vec4 s0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
    vec4 s1[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
    vec4 s2[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
    vec4 s3[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));

    vec4 s[2];
    s[0] = vec4(s0[0].x, s1[0].x, s2[0].x, s3[0].x);
    s[1] = vec4(s0[0].y, s1[0].y, s2[0].y, s3[0].y);
    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);

    s[0] = vec4(s0[0].z, s1[0].z, s2[0].z, s3[0].z);
    s[1] = vec4(s0[0].w, s1[0].w, s2[0].w, s3[0].w);
    STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 1u), s);

    s[0] = vec4(s0[1].x, s1[1].x, s2[1].x, s3[1].x);
    s[1] = vec4(s0[1].y, s1[1].y, s2[1].y, s3[1].y);
    STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 2u), s);

    s[0] = vec4(s0[1].z, s1[1].z, s2[1].z, s3[1].z);
    s[1] = vec4(s0[1].w, s1[1].w, s2[1].w, s3[1].w);
    STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 3u), s);
}
#endif /* GEMM_INTERLEAVE4x4 */

#ifdef GEMM_MM_FLOATING_POINT
/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 * in case both matrices have not been reshaped
 *
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
 *
 * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_attrs The attributes of the source matrix
 * @param[in]  src1_ptr   Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_attrs The attributes of the source matrix
 * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_attrs  The attributes of the destination matrix
 */
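/* Packing sketch (illustration only): in the MM_PROCESS_4X variant below, A is
 * read as uint (2 packed halves unpacked to vec2) and B as uvec2 (4 packed
 * halves unpacked to vec4); the arithmetic runs in 32-bit floats and results
 * are re-packed on store. The MM_PROCESS_4X_OPTIMIZED and MM_PROCESS_8X
 * variants widen the loads to uvec4 (8 halves) to reduce memory transactions.
 */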
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src0_attrs;
    ImageAttributes src1_attrs;
    ImageAttributes dst_attrs;
};

#if defined(MM_PROCESS_4X)
TENSOR_DECLARATION(1, src0Buffer, uint, src0_ptr, src0_shift, 2, readonly);
TENSOR_DECLARATION(2, src1Buffer, uvec2, src1_ptr, src1_shift, 3, readonly);
TENSOR_DECLARATION(3, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);

void main()
{
    ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
    /* Compute the address for the vector A and matrix B */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);

    /* Compute end row address for matrix A */
    uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);

    /* Reset accumulators */
    vec4 acc0 = vec4(0.0f);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vec4 acc1 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vec4 acc2 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(4));
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
    {
        vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        vec4 b0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
        vec4 b1 = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, 1));

        acc0 += b0 * vec4(a0.x);
        acc0 += b1 * vec4(a0.y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * vec4(a1.x);
        acc1 += b1 * vec4(a1.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * vec4(a2.x);
        acc2 += b1 * vec4(a2.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * vec4(a3.x);
        acc3 += b1 * vec4(a3.y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y))
    {
        vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        vec4 b0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src1_ptr, src1_iter);

        acc0 += b0 * (a0.x);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (a1.x);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (a2.x);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (a3.x);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    /* Multiply by the weight of the vector-matrix product */
    acc0 = acc0 * vec4(ALPHA);

    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#elif defined(MM_PROCESS_4X_OPTIMIZED) /* PROCESS_4X */
TENSOR_DECLARATION(1, src0Buffer, uvec4, src0_ptr, src0_shift, 4, readonly);
TENSOR_DECLARATION(2, src1Buffer, uvec2, src1_ptr, src1_shift, 3, readonly);
TENSOR_DECLARATION(3, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);

void main()
{
    ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
    /* Compute the address for the vector A and matrix B */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);

    /* Compute end row address for matrix A */
    uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);

    /* Reset accumulators */
    vec4 acc0 = vec4(0.0f);

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vec4 acc1 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vec4 acc2 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
    {
        vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        vec4 a1[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        vec4 a2[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec4 a3[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        vec4 b;

        for(int i = 0; i < 8; i++)
        {
            int j = i >> 2;
            int k = i % 4;

            b = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));

            acc0 += b * vec4(a0[j][k]);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
            acc1 += b * vec4(a1[j][k]);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
            acc2 += b * vec4(a2[j][k]);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
            acc3 += b * vec4(a3[j][k]);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        }
    }

    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 8), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
    {
        vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        vec4 a1[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        vec4 a2[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        vec4 a3[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        vec4 b;

        int leftover = COLS_A % 8;

        for(int i = 0; i < leftover; i++)
        {
            int j = i >> 2;
            int k = i % 4;

            b = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));

            acc0 += b * vec4(a0[j][k]);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
            acc1 += b * vec4(a1[j][k]);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
            acc2 += b * vec4(a2[j][k]);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
            acc3 += b * vec4(a3[j][k]);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        }
    }

    /* Multiply by the weight of the vector-matrix product */
    acc0 = acc0 * vec4(ALPHA);

    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#elif defined(MM_PROCESS_8X) /* PROCESS_8X */
TENSOR_DECLARATION(1, src0Buffer, uvec4, src0_ptr, src0_shift, 4, readonly);
TENSOR_DECLARATION(2, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);

void main()
{
    ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
    /* Compute the address for the vector A and matrix B */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);

    /* Compute end row address for matrix A */
    uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);

    /* Reset accumulators */
    vec4 acc[2];

    acc[0] = vec4(0.0f);
    acc[1] = vec4(0.0f);

    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
    {
        vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
        vec4 b[2];

        for(int i = 0; i < 8; i++)
        {
            int j = i >> 2;
            int k = i % 4;

            b = LOAD_UNPACK8_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));

            acc[0] += b[0] * vec4(a[j][k]);
            acc[1] += b[1] * vec4(a[j][k]);
        }
    }

    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * uint(2)), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
    {
        vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
        vec4 b[2];

        int leftover = COLS_A % 8;

        for(int i = 0; i < leftover; i++)
        {
            int j = i >> 2;
            int k = i % 4;

            b = LOAD_UNPACK8_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));

            acc[0] += b[0] * vec4(a[j][k]);
            acc[1] += b[1] * vec4(a[j][k]);
        }
    }

    /* Multiply by the weight of the vector-matrix product */
    acc[0] = acc[0] * vec4(ALPHA);
    acc[1] = acc[1] * vec4(ALPHA);

    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc);
}
#endif /* PROCESS_8X */
#endif /* GEMM_MM_FLOATING_POINT */

#ifdef GEMM_ACCUMULATE_BIASES
#if defined(ACCUM_PROCESS_4X)
/** This kernel accumulates each row with the biases vector
 *
 * @param[in, out] accum_ptr    Pointer to the accumulate tensor. Supported data type: F16
 * @param[in]      accum_attrs  The attributes of the accumulate tensor
 * @param[in]      biases_ptr   Pointer to the biases vector. Same as @p accum_ptr
 * @param[in]      biases_attrs The attributes of the biases tensor
 */
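/* Packing sketch (illustration only): with uvec2 elements one load unpacks to
 * four FP16 values, so the kernel below performs
 *
 *     accum[y][4 * x .. 4 * x + 3] += biases[4 * x .. 4 * x + 3]
 *
 * as a single unpack-add-pack sequence; the ACCUM_PROCESS_8X variant does the
 * same with uvec4 elements, eight values at a time.
 */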
SHADER_PARAMS_DECLARATION
{
    ImageAttributes  accum_attrs;
    VectorAttributes biases_attrs;
};

TENSOR_DECLARATION(1, accumBuffer, uvec2, accum_ptr, accum_shift, 3, restrict);
TENSOR_DECLARATION(2, biasesBuffer, uvec2, biases_ptr, biases_shift, 3, readonly);

void main(void)
{
    ImageIterator  accum_iter  = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);

    vec4 u[2];
    u[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
    u[1] = LOAD_UNPACK4_CURRENT_ITEM_HALF(biases_ptr, biases_iter);

    vec4 tmp;
    tmp = u[0] + u[1];
    STORE_PACK4_CURRENT_ITEM_HALF(accum_ptr, accum_iter, tmp);
}
#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_8X */
SHADER_PARAMS_DECLARATION
{
    ImageAttributes  accum_attrs;
    VectorAttributes biases_attrs;
};

TENSOR_DECLARATION(1, accumBuffer, uvec4, accum_ptr, accum_shift, 4, restrict);
TENSOR_DECLARATION(2, biasesBuffer, uvec4, biases_ptr, biases_shift, 4, readonly);

void main(void)
{
    ImageIterator  accum_iter  = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);

    vec4 u[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
    vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, biases_iter);

    vec4 r[2];
    r[0] = u[0] + v[0];
    r[1] = u[1] + v[1];
    STORE_PACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter, r);
}
#endif /* ACCUM_PROCESS_8X */
#endif /* GEMM_ACCUMULATE_BIASES */

#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED
/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication.
 *
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
 *
 * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_attrs The attributes of the source matrix
 * @param[in]  src1_ptr   Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_attrs The attributes of the source matrix
 * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_attrs  The attributes of the destination matrix
 */
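/* Computation sketch (illustration only): each invocation produces a 4x8 block
 * of the destination. Per step it unpacks 4 halves of interleaved A (one per
 * output row) and 8 halves of transposed B (one per output column) and
 * accumulates
 *
 *     c(r)[0..7] += a0[r] * b0[0..7],   r = 0..3
 *
 * along the K dimension; byte offsets are shifted right by 1 before comparing
 * against end_row_mtx_b because FP16 elements are 2 bytes wide.
 */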
SHADER_PARAMS_DECLARATION
{
    ImageAttributes src0_attrs;
    ImageAttributes src1_attrs;
    ImageAttributes dst_attrs;
};
TENSOR_DECLARATION(1, src0Buffer, uvec2, src0_ptr, src0_shift, 3, readonly);
TENSOR_DECLARATION(2, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);

void main()
{
    ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

    /* Compute address for matrix A and B */
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(gl_GlobalInvocationID.x) * (src1_attrs.stride_y));
    /* Compute end row address for matrix B */
    int end_row_mtx_b = (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) + int(COLS_B);

    /* Reset accumulators */
    vec4 c00[2];
    vec4 c10[2];
    vec4 c20[2];
    vec4 c30[2];
    c00[0] = vec4(0.0f);
    c00[1] = vec4(0.0f);
    c10[0] = vec4(0.0f);
    c10[1] = vec4(0.0f);
    c20[0] = vec4(0.0f);
    c20[1] = vec4(0.0f);
    c30[0] = vec4(0.0f);
    c30[1] = vec4(0.0f);

    // FIXME: is loop unrolling really needed for GLES?
    for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) <= (end_row_mtx_b - 16); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 16), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 32))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        vec4 a0    = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
        vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);

        c00[0] += vec4(a0.x) * b0[0];
        c00[1] += vec4(a0.x) * b0[1];
        c10[0] += vec4(a0.y) * b0[0];
        c10[1] += vec4(a0.y) * b0[1];
        c20[0] += vec4(a0.z) * b0[0];
        c20[1] += vec4(a0.z) * b0[1];
        c30[0] += vec4(a0.w) * b0[0];
        c30[1] += vec4(a0.w) * b0[1];

        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        a0 = LOAD_UNPACK4_HALF(src0_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src0_iter, 8));
        b0 = LOAD_UNPACK8_HALF(src1_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src1_iter, 16));

        c00[0] += vec4(a0.x) * b0[0];
        c00[1] += vec4(a0.x) * b0[1];
        c10[0] += vec4(a0.y) * b0[0];
        c10[1] += vec4(a0.y) * b0[1];
        c20[0] += vec4(a0.z) * b0[0];
        c20[1] += vec4(a0.z) * b0[1];
        c30[0] += vec4(a0.w) * b0[0];
        c30[1] += vec4(a0.w) * b0[1];
    }

    for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) < end_row_mtx_b; TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 8), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 16))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        vec4 a0    = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
        vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);

        c00[0] += vec4(a0.x) * b0[0];
        c00[1] += vec4(a0.x) * b0[1];
        c10[0] += vec4(a0.y) * b0[0];
        c10[1] += vec4(a0.y) * b0[1];
        c20[0] += vec4(a0.z) * b0[0];
        c20[1] += vec4(a0.z) * b0[1];
        c30[0] += vec4(a0.w) * b0[0];
        c30[1] += vec4(a0.w) * b0[1];
    }

    /* Multiply by the weight of the matrix product */
    c00[0] = c00[0] * vec4(ALPHA);
    c00[1] = c00[1] * vec4(ALPHA);
    c10[0] = c10[0] * vec4(ALPHA);
    c10[1] = c10[1] * vec4(ALPHA);
    c20[0] = c20[0] * vec4(ALPHA);
    c20[1] = c20[1] * vec4(ALPHA);
    c30[0] = c30[0] * vec4(ALPHA);
    c30[1] = c30[1] * vec4(ALPHA);

    /* Store the 4x8 block */
    STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), c00);
    STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), c10);
    STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), c20);
    STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), c30);
}
#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
#else  /* DATA_TYPE_FP16 */
#error Data type not supported
#endif /* DATA_TYPE_FP32 */