blob: 7ac421b7b6ee220af77c3326b91263bc9006cced [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010024#include "fixed_point.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025#include "helpers.h"
26
27/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
28 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010029 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +010030 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
31 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
32 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
33 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
34 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010035 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
37 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
38 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
39 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
40 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
41 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010042__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
43 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044{
45 uint x = get_global_id(0);
46 uint y = get_global_id(1);
47
48 /* Compute address for Matrix B - source */
49 Image src = CONVERT_TO_IMAGE_STRUCT(src);
50
51 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
52 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
53
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010054 uint4 b0 = vload4(0, (__global uint *)src.ptr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010056 vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010057}
58
59/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
60 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010061 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
Anthony Barbier6ff3b192017-09-04 18:44:23 +010062 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
63 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
64 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
65 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
66 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010067 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +010068 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
69 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
70 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
71 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
72 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
73 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010074__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
75 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +010076{
77 uint x = get_global_id(0);
78 uint y = get_global_id(1);
79
80 /* Compute address for Matrix B - source */
81 Image src = CONVERT_TO_IMAGE_STRUCT(src);
82
83 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
84 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
85
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010086 ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010087
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010088 vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010089}
90
91/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
92 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010093 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
Anthony Barbier6ff3b192017-09-04 18:44:23 +010094 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
95 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
96 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
97 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
98 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010099 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100100 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
101 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
102 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
103 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
104 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
105 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +0100106__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
107 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100108{
109 uint x = get_global_id(0);
110 uint y = get_global_id(1);
111
112 /* Compute address for Matrix B - source */
113 Image src = CONVERT_TO_IMAGE_STRUCT(src);
114
115 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
116 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
117
118 uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
119
120 vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
121}
122
123/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
124 *
125 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
126 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
127 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
128 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
129 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
130 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100131 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100132 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
133 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
134 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
135 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
136 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
137 */
138__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
139 IMAGE_DECLARATION(dst))
140{
141 /* Compute source and destination addresses */
142 Image src = CONVERT_TO_IMAGE_STRUCT(src);
143 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
144
145 /* Load values from Matrix A */
146 float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
147 float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
148 float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
149 float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
150
151 float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
152 vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
153
154 val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
155 vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
156
157 val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
158 vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
159
160 val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
161 vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
162}
163
164/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
165 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100166 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100167 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
168 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
169 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
170 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
171 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100172 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100173 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
174 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
175 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
176 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
177 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
178 */
179__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
180 IMAGE_DECLARATION(dst))
181{
182 /* Compute source and destination addresses */
183 Image src = CONVERT_TO_IMAGE_STRUCT(src);
184 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
185
186 /* Load values from Matrix A */
187 half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
188 half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
189 half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
190 half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
191
192 half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
193 vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
194
195 val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
196 vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
197
198 val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
199 vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
200
201 val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
202 vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
203}
204
205/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
206 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100207 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100208 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
209 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
210 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
211 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
212 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100213 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100214 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
215 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
216 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
217 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
218 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
219 */
220__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
221 IMAGE_DECLARATION(dst))
222{
223 /* Compute source and destination addresses */
224 Image src = CONVERT_TO_IMAGE_STRUCT(src);
225 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
226
227 /* Load values from Matrix A */
228 uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
229 uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
230 uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
231 uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
232
233 uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
234 (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
235 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
236
237 val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
238 (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
239 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
240
241 val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
242 (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
243 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
244
245 val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
246 (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
247 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
248}
249
250/** This kernel accumulates each row with the biases vector
251 *
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100252 * @note The data type must be passed at compile time -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
253 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100254 * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/QS8/U16/S16/F16/U32/S32/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100255 * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
256 * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
257 * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
258 * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
259 * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100260 * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100261 * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
262 * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
263 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
264 */
Anthony Barbierac69aa12017-07-03 17:39:37 +0100265#ifdef DATA_TYPE
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100266__kernel void gemm_accumulate_biases(
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100267 IMAGE_DECLARATION(accum),
268 VECTOR_DECLARATION(biases))
269{
270 Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
271 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
272
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100273 VEC_DATA_TYPE(DATA_TYPE, 16)
274 accum_value = vload16(0, (__global DATA_TYPE *)accum.ptr);
275 VEC_DATA_TYPE(DATA_TYPE, 16)
276 biases_value = vload16(0, (__global DATA_TYPE *)biases.ptr);
277 accum_value = biases_value + accum_value;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100278
279 // Store result in the accummulate buffer
Gian Marco Iodice578ab612017-06-23 09:34:33 +0100280 vstore16(accum_value, 0, (__global DATA_TYPE *)accum.ptr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100281}
Anthony Barbierac69aa12017-07-03 17:39:37 +0100282#endif /* DATA_TYPE */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100283
Anthony Barbierac69aa12017-07-03 17:39:37 +0100284#ifdef WIDTH_MATRIX_B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100285/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100286 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100287 *
288 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B
289 *
290 * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
291 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
292 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
293 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
294 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
295 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100296 * @param[in] src1_ptr Pointer to the source matrix. Supported formats: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100297 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
298 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
299 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
300 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
301 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100302 * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100303 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
304 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
305 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
306 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
307 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
308 * @param[in] a_offset Offset to be added to each element of the matrix A
309 * @param[in] b_offset Offset to be added to each element of the matrix B.
310 * @param[in] c_offset Offset to be added to each element of the matrix C.
311 * @param[in] c_mult_int Multiplied with each element of the matrix C.
312 * @param[in] shift Number of bits to shift right the result.
313 */
314__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
315 IMAGE_DECLARATION(src1),
316 IMAGE_DECLARATION(dst),
317 int a_offset,
318 int b_offset,
319 int c_offset,
320 int c_mult_int,
321 int shift)
322{
323 /* src_addr.s0 = address of matrix A */
324 /* src_addr.s1 = address of matrix B */
325
326 /* Compute address for matrix A and B */
327 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
328 (src1_stride_y));
329
330 /* Add offset_first_element_in_bytes */
331 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
332
333 /* Compute end row address for matrix B */
334 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
335
336 /* Reset accumulators */
337 int16 c00 = 0.0f;
338 int16 c10 = 0.0f;
339 int16 c20 = 0.0f;
340 int16 c30 = 0.0f;
341
342 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
343 {
344 /* Load values from matrix A (interleaved) and matrix B (transposed) */
345 int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
346 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
347
348 c00 += (int16)a0.s0 * b0;
349 c10 += (int16)a0.s1 * b0;
350 c20 += (int16)a0.s2 * b0;
351 c30 += (int16)a0.s3 * b0;
352
353 int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
354
355 c00 += (int16)a0.s4 * b1;
356 c10 += (int16)a0.s5 * b1;
357 c20 += (int16)a0.s6 * b1;
358 c30 += (int16)a0.s7 * b1;
359 }
360
361 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
362 {
363 /* Load values from matrix A (interleaved) and matrix B (transposed) */
364 int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
365 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
366
367 c00 += (int16)a0.s0 * b0;
368 c10 += (int16)a0.s1 * b0;
369 c20 += (int16)a0.s2 * b0;
370 c30 += (int16)a0.s3 * b0;
371 }
372
373 /* Compute destination address */
374 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
375
376 /* Multiply by the weight of matrix product */
377 c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
378 c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
379 c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
380 c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
381
382 /* Store 4x16 block */
383 vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
384 vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
385 vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
386 vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
387}
Anthony Barbierac69aa12017-07-03 17:39:37 +0100388#endif /* WIDTH_MATRIX_B */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100389
Anthony Barbierac69aa12017-07-03 17:39:37 +0100390#if defined(WIDTH_MATRIX_B) && defined(ALPHA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100391/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100392 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100393 *
394 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
395 *
396 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
397 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
398 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
399 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
400 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
401 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100402 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100403 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
404 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
405 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
406 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
407 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100408 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100409 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
410 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
411 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
412 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
413 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
414 */
415__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
416 IMAGE_DECLARATION(src1),
417 IMAGE_DECLARATION(dst))
418{
419 /* src_addr.s0 = address of matrix A */
420 /* src_addr.s1 = address of matrix B */
421
422 /* Compute address for matrix A and B */
423 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
424 (src1_stride_y));
425
426 /* Add offset_first_element_in_bytes */
427 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
428
429 /* Divide by 4 in order to get the src_addr in unit of float */
430 src_addr = src_addr >> 2;
431
432 /* Compute end row address for matrix B */
433 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
434
435 /* Reset accumulators */
436 float4 c00 = 0.0f;
437 float4 c10 = 0.0f;
438 float4 c20 = 0.0f;
439 float4 c30 = 0.0f;
440
441 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
442 {
443 /* Load values from matrix A (interleaved) and matrix B (transposed) */
444 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
445 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
446
447 c00 += (float4)a0.s0 * b0;
448 c10 += (float4)a0.s1 * b0;
449 c20 += (float4)a0.s2 * b0;
450 c30 += (float4)a0.s3 * b0;
451
452 /* Load values from matrix A (interleaved) and matrix B (transposed) */
453 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
454 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
455
456 c00 += (float4)a0.s0 * b0;
457 c10 += (float4)a0.s1 * b0;
458 c20 += (float4)a0.s2 * b0;
459 c30 += (float4)a0.s3 * b0;
460 }
461
462 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
463 {
464 /* Load values from matrix A (interleaved) and matrix B (transposed) */
465 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
466 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
467
468 c00 += (float4)a0.s0 * b0;
469 c10 += (float4)a0.s1 * b0;
470 c20 += (float4)a0.s2 * b0;
471 c30 += (float4)a0.s3 * b0;
472 }
473
474 /* Compute destination address */
475 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
476
477 /* Multiply by the weight of matrix product */
478 c00 = c00 * (float4)ALPHA;
479 c10 = c10 * (float4)ALPHA;
480 c20 = c20 * (float4)ALPHA;
481 c30 = c30 * (float4)ALPHA;
482
483 /* Store 4x4 block */
484 vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
485 vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
486 vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
487 vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
488}
489
490/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100491 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100492 *
493 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
494 *
495 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
496 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
497 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
498 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
499 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
500 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100501 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100502 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
503 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
504 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
505 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
506 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100507 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100508 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
509 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
510 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
511 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
512 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
513 */
514__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
515 IMAGE_DECLARATION(src1),
516 IMAGE_DECLARATION(dst))
517{
518 // src_addr_a = address of matrix A
519 // src_addr_b = address of matrix B
520 __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
521 __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
522
523 // Compute end row address for matrix B
524 __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
525
526 // Reset accumulators
527 float c00 = 0.0f;
528 float c01 = 0.0f;
529 float c02 = 0.0f;
530 float c03 = 0.0f;
531 float c10 = 0.0f;
532 float c11 = 0.0f;
533 float c12 = 0.0f;
534 float c13 = 0.0f;
535 float c20 = 0.0f;
536 float c21 = 0.0f;
537 float c22 = 0.0f;
538 float c23 = 0.0f;
539 float c30 = 0.0f;
540 float c31 = 0.0f;
541 float c32 = 0.0f;
542 float c33 = 0.0f;
543
544 for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
545 {
546 // Load values from matrix A (interleaved) and matrix B (transposed)
547 float4 a0 = vload4(0, src_addr_a);
548 float4 b0 = vload4(0, src_addr_b);
549
550 c00 = fma(a0.s0, b0.s0, c00);
551 c01 = fma(a0.s0, b0.s1, c01);
552 c02 = fma(a0.s0, b0.s2, c02);
553 c03 = fma(a0.s0, b0.s3, c03);
554
555 c10 = fma(a0.s1, b0.s0, c10);
556 c11 = fma(a0.s1, b0.s1, c11);
557 c12 = fma(a0.s1, b0.s2, c12);
558 c13 = fma(a0.s1, b0.s3, c13);
559
560 c20 = fma(a0.s2, b0.s0, c20);
561 c21 = fma(a0.s2, b0.s1, c21);
562 c22 = fma(a0.s2, b0.s2, c22);
563 c23 = fma(a0.s2, b0.s3, c23);
564
565 c30 = fma(a0.s3, b0.s0, c30);
566 c31 = fma(a0.s3, b0.s1, c31);
567 c32 = fma(a0.s3, b0.s2, c32);
568 c33 = fma(a0.s3, b0.s3, c33);
569
570 // Load values from matrix A (interleaved) and matrix B (transposed)
571 a0 = vload4(0, src_addr_a + 4);
572 b0 = vload4(0, src_addr_b + 4);
573
574 c00 = fma(a0.s0, b0.s0, c00);
575 c01 = fma(a0.s0, b0.s1, c01);
576 c02 = fma(a0.s0, b0.s2, c02);
577 c03 = fma(a0.s0, b0.s3, c03);
578
579 c10 = fma(a0.s1, b0.s0, c10);
580 c11 = fma(a0.s1, b0.s1, c11);
581 c12 = fma(a0.s1, b0.s2, c12);
582 c13 = fma(a0.s1, b0.s3, c13);
583
584 c20 = fma(a0.s2, b0.s0, c20);
585 c21 = fma(a0.s2, b0.s1, c21);
586 c22 = fma(a0.s2, b0.s2, c22);
587 c23 = fma(a0.s2, b0.s3, c23);
588
589 c30 = fma(a0.s3, b0.s0, c30);
590 c31 = fma(a0.s3, b0.s1, c31);
591 c32 = fma(a0.s3, b0.s2, c32);
592 c33 = fma(a0.s3, b0.s3, c33);
593
594 // Load values from matrix A (interleaved) and matrix B (transposed)
595 a0 = vload4(0, src_addr_a + 8);
596 b0 = vload4(0, src_addr_b + 8);
597
598 c00 = fma(a0.s0, b0.s0, c00);
599 c01 = fma(a0.s0, b0.s1, c01);
600 c02 = fma(a0.s0, b0.s2, c02);
601 c03 = fma(a0.s0, b0.s3, c03);
602
603 c10 = fma(a0.s1, b0.s0, c10);
604 c11 = fma(a0.s1, b0.s1, c11);
605 c12 = fma(a0.s1, b0.s2, c12);
606 c13 = fma(a0.s1, b0.s3, c13);
607
608 c20 = fma(a0.s2, b0.s0, c20);
609 c21 = fma(a0.s2, b0.s1, c21);
610 c22 = fma(a0.s2, b0.s2, c22);
611 c23 = fma(a0.s2, b0.s3, c23);
612
613 c30 = fma(a0.s3, b0.s0, c30);
614 c31 = fma(a0.s3, b0.s1, c31);
615 c32 = fma(a0.s3, b0.s2, c32);
616 c33 = fma(a0.s3, b0.s3, c33);
617
618 // Load values from matrix A (interleaved) and matrix B (transposed)
619 a0 = vload4(0, src_addr_a + 12);
620 b0 = vload4(0, src_addr_b + 12);
621
622 c00 = fma(a0.s0, b0.s0, c00);
623 c01 = fma(a0.s0, b0.s1, c01);
624 c02 = fma(a0.s0, b0.s2, c02);
625 c03 = fma(a0.s0, b0.s3, c03);
626
627 c10 = fma(a0.s1, b0.s0, c10);
628 c11 = fma(a0.s1, b0.s1, c11);
629 c12 = fma(a0.s1, b0.s2, c12);
630 c13 = fma(a0.s1, b0.s3, c13);
631
632 c20 = fma(a0.s2, b0.s0, c20);
633 c21 = fma(a0.s2, b0.s1, c21);
634 c22 = fma(a0.s2, b0.s2, c22);
635 c23 = fma(a0.s2, b0.s3, c23);
636
637 c30 = fma(a0.s3, b0.s0, c30);
638 c31 = fma(a0.s3, b0.s1, c31);
639 c32 = fma(a0.s3, b0.s2, c32);
640 c33 = fma(a0.s3, b0.s3, c33);
641 }
642
643 for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
644 {
645 // Load values from matrix A (interleaved) and matrix B (transposed)
646 float4 a0 = vload4(0, src_addr_a);
647 float4 b0 = vload4(0, src_addr_b);
648
649 c00 = fma(a0.s0, b0.s0, c00);
650 c01 = fma(a0.s0, b0.s1, c01);
651 c02 = fma(a0.s0, b0.s2, c02);
652 c03 = fma(a0.s0, b0.s3, c03);
653
654 c10 = fma(a0.s1, b0.s0, c10);
655 c11 = fma(a0.s1, b0.s1, c11);
656 c12 = fma(a0.s1, b0.s2, c12);
657 c13 = fma(a0.s1, b0.s3, c13);
658
659 c20 = fma(a0.s2, b0.s0, c20);
660 c21 = fma(a0.s2, b0.s1, c21);
661 c22 = fma(a0.s2, b0.s2, c22);
662 c23 = fma(a0.s2, b0.s3, c23);
663
664 c30 = fma(a0.s3, b0.s0, c30);
665 c31 = fma(a0.s3, b0.s1, c31);
666 c32 = fma(a0.s3, b0.s2, c32);
667 c33 = fma(a0.s3, b0.s3, c33);
668 }
669
670 // Compute destination address
671 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
672
673 // Multiply by the weight of matrix product
674 c00 = c00 * ALPHA;
675 c01 = c01 * ALPHA;
676 c02 = c02 * ALPHA;
677 c03 = c03 * ALPHA;
678 c10 = c10 * ALPHA;
679 c11 = c11 * ALPHA;
680 c12 = c12 * ALPHA;
681 c13 = c13 * ALPHA;
682 c20 = c20 * ALPHA;
683 c21 = c21 * ALPHA;
684 c22 = c22 * ALPHA;
685 c23 = c23 * ALPHA;
686 c30 = c30 * ALPHA;
687 c31 = c31 * ALPHA;
688 c32 = c32 * ALPHA;
689 c33 = c33 * ALPHA;
690
691 barrier(CLK_GLOBAL_MEM_FENCE);
692
693 // Store 4x4 block
694 vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
695 vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
696 vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
697 vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
698}
699
700/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100701 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100702 *
703 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
704 *
705 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
706 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
707 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
708 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
709 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
710 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100711 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100712 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
713 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
714 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
715 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
716 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100717 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100718 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
719 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
720 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
721 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
722 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
723 */
724__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
725 IMAGE_DECLARATION(src1),
726 IMAGE_DECLARATION(dst))
727{
728 /* src_addr.s0 = address of matrix A */
729 /* src_addr.s1 = address of matrix B */
730
731 /* Compute address for matrix A and B */
732 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
733 (src1_stride_y));
734
735 /* Add offset_first_element_in_bytes */
736 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
737
738 /* Divide by 2 in order to get the src_addr in unit of half */
739 src_addr = src_addr >> 1;
740
741 /* Compute end row address for matrix B */
742 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
743
744 /* Reset accumulators */
745 half8 c00 = 0.0f;
746 half8 c10 = 0.0f;
747 half8 c20 = 0.0f;
748 half8 c30 = 0.0f;
749
750 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
751 {
752 /* Load values from matrix A (interleaved) and matrix B (transposed) */
753 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
754 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
755
756 c00 += (half8)a0.s0 * b0;
757 c10 += (half8)a0.s1 * b0;
758 c20 += (half8)a0.s2 * b0;
759 c30 += (half8)a0.s3 * b0;
760
761 /* Load values from matrix A (interleaved) and matrix B (transposed) */
762 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
763 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
764
765 c00 += (half8)a0.s0 * b0;
766 c10 += (half8)a0.s1 * b0;
767 c20 += (half8)a0.s2 * b0;
768 c30 += (half8)a0.s3 * b0;
769 }
770
771 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
772 {
773 /* Load values from matrix A (interleaved) and matrix B (transposed) */
774 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
775 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
776
777 c00 += (half8)a0.s0 * b0;
778 c10 += (half8)a0.s1 * b0;
779 c20 += (half8)a0.s2 * b0;
780 c30 += (half8)a0.s3 * b0;
781 }
782
783 /* Compute destination address */
784 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
785
786 /* Multiply by the weight of matrix product */
787 c00 = c00 * (half8)ALPHA;
788 c10 = c10 * (half8)ALPHA;
789 c20 = c20 * (half8)ALPHA;
790 c30 = c30 * (half8)ALPHA;
791
792 /* Store 4x8 block */
793 vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
794 vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
795 vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
796 vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
797}
798
Anthony Barbierac69aa12017-07-03 17:39:37 +0100799#ifdef FIXED_POINT_POSITION
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100800/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision
801 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
802 *
803 * @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DWIDTH_MATRIX_B -DALPHA and -DFIXED_POINT_POSITION
804 *
805 * @note: ALPHA must be passed in 8 bit fixed point format
806 *
807 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
808 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
809 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
810 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
811 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
812 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
813 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
814 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
815 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
816 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
817 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
818 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
819 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
820 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
821 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
822 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
823 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
824 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
825 */
826__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
827 IMAGE_DECLARATION(src1),
828 IMAGE_DECLARATION(dst))
829{
830 /* src_addr.s0 = address of matrix A */
831 /* src_addr.s1 = address of matrix B */
832
833 /* Compute address for matrix A and B */
834 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
835 (src1_stride_y));
836
837 /* Add offset_first_element_in_bytes */
838 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
839
840 /* Compute end row address for matrix B */
841 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
842
843 /* Reset accumulators */
844 short8 c00 = 0.0f;
845 short8 c10 = 0.0f;
846 short8 c20 = 0.0f;
847 short8 c30 = 0.0f;
848 short8 c01 = 0.0f;
849 short8 c11 = 0.0f;
850 short8 c21 = 0.0f;
851 short8 c31 = 0.0f;
852
853 /* This for loop performs 1 accumulation for each iteration */
854 for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
855 {
856 /* Load values from matrix A (interleaved) and matrix B (transposed) */
857 char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
858 char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
859
860 c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
861 c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
862 c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);
863 c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);
864
865 c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
866 c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);
867 c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);
868 c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);
869 }
870
871 /* Compute destination address */
872 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
873
874 /* Multiply by the weight of matrix product */
875 char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));
876 char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));
877 char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));
878 char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));
879
880 c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
881 c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
882 c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
883 c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
884
885 /* Store 16x4 block */
886 vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
887 vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
888 vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
889 vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
890}
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100891
892/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
893 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
894 *
895 * @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DWIDTH_MATRIX_B -DALPHA and -DFIXED_POINT_POSITION
896 *
897 * @note: ALPHA must be passed in 16 bit fixed point format
898 *
899 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
900 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
901 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
902 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
903 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
904 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
905 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
906 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
907 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
908 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
909 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
910 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
911 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
912 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
913 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
914 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
915 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
916 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
917 */
918__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
919 IMAGE_DECLARATION(src1),
920 IMAGE_DECLARATION(dst))
921{
922 /* src_addr.s0 = address of matrix A */
923 /* src_addr.s1 = address of matrix B */
924
925 /* Compute address for matrix A and B */
926 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
927 (src1_stride_y));
928
929 /* Add offset_first_element_in_bytes */
930 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
931
932 /* Divide by 2 in order to get the src_addr in unit of short */
933 src_addr = src_addr >> 1;
934
935 /* Compute end row address for matrix B */
936 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
937
938 /* Reset accumulators */
939 int8 c00 = 0.0f;
940 int8 c10 = 0.0f;
941 int8 c20 = 0.0f;
942 int8 c30 = 0.0f;
943
944 /* This for loop performs 1 accumulation for each iteration */
945 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
946 {
947 /* Load values from matrix A (interleaved) and matrix B (transposed) */
948 short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);
949 short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);
950
951 c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
952 c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
953 c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);
954 c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);
955 }
956
957 /* Compute destination address */
958 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
959
960 /* Multiply by the weight of matrix product */
961 short8 c00_qs16 = convert_short8_sat(c00);
962 short8 c10_qs16 = convert_short8_sat(c10);
963 short8 c20_qs16 = convert_short8_sat(c20);
964 short8 c30_qs16 = convert_short8_sat(c30);
965
966 c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
967 c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
968 c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
969 c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
970
971 /* Store 8x4 block */
972 vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
973 vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
974 vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
975 vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
976}
977#endif // defined(FIXED_POINT_POSITION)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100978
Anthony Barbierac69aa12017-07-03 17:39:37 +0100979#ifdef WIDTH_VECTOR_A
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100980/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
981 *
982 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
983 *
984 * @attention The input vector A and matrix B must not be reshaped
985 *
986 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
987 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
988 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
989 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
990 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
991 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100992 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100993 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
994 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
995 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
996 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
997 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100998 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100999 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1000 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1001 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1002 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1003 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1004 */
1005__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
1006 IMAGE_DECLARATION(src1),
1007 IMAGE_DECLARATION(dst))
1008{
1009 int idx = get_global_id(0) * 4;
1010
1011 /* Compute the address for the vector A and matrix B */
1012 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1013 src_addr.s1 += idx * sizeof(float);
1014
1015 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
1016
1017 float4 acc = 0.0f;
1018
1019 for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
1020 {
1021 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
1022 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1023 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
1024
1025 acc += b0 * (float4)a0.s0;
1026 acc += b1 * (float4)a0.s1;
1027 }
1028
1029 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
1030 {
1031 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
1032 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1033
1034 acc += b0 * (float4)a0;
1035 }
1036
1037 /* Compute destination address */
1038 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1039
1040 /* Multiply by the weight of vector-matrix product */
1041 acc = acc * (float4)ALPHA;
1042
1043 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
1044}
1045
1046/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
1047 *
1048 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
1049 *
1050 * @attention The input vector A and matrix B must not be reshaped
1051 *
1052 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
1053 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1054 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1055 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1056 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1057 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001058 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001059 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1060 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1061 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1062 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1063 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001064 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001065 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1066 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1067 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1068 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1069 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1070 */
1071__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
1072 IMAGE_DECLARATION(src1),
1073 IMAGE_DECLARATION(dst))
1074{
1075 int idx = get_global_id(0) * 8;
1076
1077 /* Compute the address for the vector A and matrix B */
1078 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1079 src_addr.s1 += idx * sizeof(half);
1080
1081 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
1082
1083 half8 acc = 0.0f;
1084
1085 for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
1086 {
1087 half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
1088 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
1089 half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
1090 half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
1091 half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
1092
1093 acc += b0 * (half8)a0.s0;
1094 acc += b1 * (half8)a0.s1;
1095 acc += b2 * (half8)a0.s2;
1096 acc += b3 * (half8)a0.s3;
1097 }
1098
1099 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
1100 {
1101 half a0 = *((__global half *)(src0_ptr + src_addr.s0));
1102 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
1103
1104 acc += b0 * (half8)a0;
1105 }
1106
1107 /* Compute destination address */
1108 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1109
1110 /* Multiply by the weight of vector-matrix product */
1111 acc = acc * (half8)ALPHA;
1112
1113 vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
1114}
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001115
Anthony Barbierac69aa12017-07-03 17:39:37 +01001116#ifdef FIXED_POINT_POSITION
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001117/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1) in 8 bit fixed point
1118 *
1119 * @attention The width of vector A, the width of matrix B, the alpha's value and the fixed point position need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B, -DALPHA and -DFIXED_POINT_POSITION
1120 *
1121 * @attention The input vector A and matrix B must not be reshaped
1122 *
1123 * @note: ALPHA must be passed in 8 bit fixed point format
1124 *
1125 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
1126 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1127 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1128 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1129 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1130 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1131 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
1132 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1133 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1134 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1135 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1136 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1137 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
1138 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1139 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1140 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1141 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1142 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1143 */
1144__kernel void gemm_vm_qs8(IMAGE_DECLARATION(src0),
1145 IMAGE_DECLARATION(src1),
1146 IMAGE_DECLARATION(dst))
1147{
1148 int idx = get_global_id(0) * 16;
1149
1150 /* Compute the address for the vector A and matrix B */
1151 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1152 src_addr.s1 += idx;
1153
1154 int end_row_vec_a = src_addr.s0 + WIDTH_VECTOR_A;
1155
1156 short8 acc0 = 0;
1157 short8 acc1 = 0;
1158
1159 /* This for loop performs 4 accumulations per iteration */
1160 for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
1161 {
1162 char4 a0 = vload4(0, (__global char *)(src0_ptr + src_addr.s0));
1163 char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
1164 char16 b1 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
1165 char16 b2 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
1166 char16 b3 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
1167
1168 acc0 = mlal_sat_qs8x8(acc0, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
1169 acc0 = mlal_sat_qs8x8(acc0, (char8)a0.s1, b1.s01234567, FIXED_POINT_POSITION);
1170 acc0 = mlal_sat_qs8x8(acc0, (char8)a0.s2, b2.s01234567, FIXED_POINT_POSITION);
1171 acc0 = mlal_sat_qs8x8(acc0, (char8)a0.s3, b3.s01234567, FIXED_POINT_POSITION);
1172
1173 acc1 = mlal_sat_qs8x8(acc1, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
1174 acc1 = mlal_sat_qs8x8(acc1, (char8)a0.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
1175 acc1 = mlal_sat_qs8x8(acc1, (char8)a0.s2, b2.s89ABCDEF, FIXED_POINT_POSITION);
1176 acc1 = mlal_sat_qs8x8(acc1, (char8)a0.s3, b3.s89ABCDEF, FIXED_POINT_POSITION);
1177 }
1178
1179 /* Left-over accumulations */
1180 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
1181 {
1182 char a0 = *((__global char *)(src0_ptr + src_addr.s0));
1183 char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1));
1184
1185 acc0 = mlal_sat_qs8x8(acc0, (char8)a0, b0.s01234567, FIXED_POINT_POSITION);
1186 acc1 = mlal_sat_qs8x8(acc1, (char8)a0, b0.s89ABCDEF, FIXED_POINT_POSITION);
1187 }
1188
1189 /* Compute destination address */
1190 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1191
1192 /* Multiply by the weight of matrix product */
1193 char16 acc_qs8 = convert_char16_sat((short16)(acc0, acc1));
1194
1195 acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
1196
1197 /* Store 16 values */
1198 vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
1199}
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001200
1201/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1) in 16 bit fixed point
1202 *
1203 * @attention The width of vector A, the width of matrix B, the alpha's value and the fixed point position need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B, -DALPHA and -DFIXED_POINT_POSITION
1204 *
1205 * @attention The input vector A and matrix B must not be reshaped
1206 *
1207 * @note: ALPHA must be passed in 16 bit fixed point format
1208 *
1209 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
1210 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1211 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1212 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1213 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1214 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1215 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
1216 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1217 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1218 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1219 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1220 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1221 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
1222 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1223 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1224 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1225 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1226 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1227 */
1228__kernel void gemm_vm_qs16(IMAGE_DECLARATION(src0),
1229 IMAGE_DECLARATION(src1),
1230 IMAGE_DECLARATION(dst))
1231{
1232 int idx = get_global_id(0) * 8;
1233
1234 /* Compute the address for the vector A and matrix B */
1235 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1236 src_addr.s1 += idx * sizeof(short);
1237
1238 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(short));
1239
1240 /* Reset accumulator */
1241 int8 acc0 = 0;
1242
1243 /* This for loop performs 4 accumulations per iteration */
1244 for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(short)); src_addr += (int2)(4 * sizeof(short), 4 * src1_stride_y))
1245 {
1246 short4 a0 = vload4(0, (__global short *)(src0_ptr + src_addr.s0));
1247 short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
1248 short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
1249 short8 b2 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
1250 short8 b3 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
1251
1252 acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);
1253 acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);
1254 acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s2, b2, FIXED_POINT_POSITION);
1255 acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s3, b3, FIXED_POINT_POSITION);
1256 }
1257
1258 /* Left-over accumulations */
1259 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))
1260 {
1261 short a0 = *((__global short *)(src0_ptr + src_addr.s0));
1262 short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));
1263
1264 acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);
1265 }
1266
1267 /* Compute destination address */
1268 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1269
1270 /* Multiply by the weight of matrix product */
1271 short8 acc_qs16 = convert_short8_sat(acc0);
1272
1273 acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
1274
1275 /* Store 8 values */
1276 vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
1277}
1278#endif /* defined(FIXED_POINT_POSITION) */
1279#endif /* defined(WIDTH_VECTOR_A) */
1280#endif /* defined(WIDTH_MATRIX_B) && defined(ALPHA) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001281
Anthony Barbierac69aa12017-07-03 17:39:37 +01001282#ifdef BETA
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001283/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
1284 *
1285 * @attention The beta's value need to be passed at compile time using -DBETA
1286 *
1287 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
1288 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1289 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1290 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1291 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1292 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001293 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001294 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1295 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1296 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1297 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1298 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1299 */
1300__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
1301 IMAGE_DECLARATION(dst))
1302{
1303 /* Compute source and destination addresses */
1304 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1305 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1306
1307 /* Load values from A x B */
1308 float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
1309
1310 /* Load values from Matrix C */
1311 float4 c = vload4(0, (__global float *)src.ptr);
1312
1313 /* Computes alpha * axb + beta * c */
1314 float4 out = alpha_ab + (float4)BETA * c;
1315
1316 /* Store final result in axb matrix */
1317 vstore4(out, 0, (__global float *)dst.ptr);
1318}
1319
1320/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
1321 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001322 * @attention The beta's value need to be passed at compile time using -DBETA
1323 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001324 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
1325 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1326 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1327 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1328 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1329 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001330 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001331 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1332 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1333 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1334 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1335 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1336 */
1337__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
1338 IMAGE_DECLARATION(dst))
1339{
1340 /* Compute source and destination addresses */
1341 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1342 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1343
1344 /* Load values from A x B */
1345 half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
1346
1347 /* Load values from Matrix C */
1348 half8 c = vload8(0, (__global half *)src.ptr);
1349
1350 /* Computes alpha * axb + beta * c */
1351 half8 out = alpha_ab + (half8)BETA * c;
1352
1353 /* Store final result in axb matrix */
1354 vstore8(out, 0, (__global half *)dst.ptr);
1355}
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001356
Anthony Barbierac69aa12017-07-03 17:39:37 +01001357#ifdef FIXED_POINT_POSITION
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001358/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
1359 *
1360 * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
1361 *
1362 * @note: BETA must be passed in 8 bit fixed point format
1363 *
1364 * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS8
1365 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1366 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1367 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1368 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1369 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
1370 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
1371 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1372 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1373 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1374 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1375 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1376 */
1377__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),
1378 IMAGE_DECLARATION(dst))
1379{
1380 /* Compute source and destination addresses */
1381 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1382 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1383
1384 /* Load values from A x B */
1385 char16 alpha_ab = vload16(0, (__global char *)dst.ptr);
1386
1387 /* Load values from Matrix C */
1388 char16 c = vload16(0, (__global char *)src.ptr);
1389
1390 /* Computes alpha * axb + beta * c */
1391 char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);
1392
1393 /* Store final result in axb matrix */
1394 vstore16(out, 0, (__global char *)dst.ptr);
1395}
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001396
1397/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
1398 *
1399 * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
1400 *
1401 * @note: BETA must be passed in 16 bit fixed point format
1402 *
1403 * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16
1404 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1405 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1406 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1407 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1408 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
1409 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
1410 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1411 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1412 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1413 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1414 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1415 */
1416__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),
1417 IMAGE_DECLARATION(dst))
1418{
1419 /* Compute source and destination addresses */
1420 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1421 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1422
1423 /* Load values from A x B */
1424 short8 alpha_ab = vload8(0, (__global short *)dst.ptr);
1425
1426 /* Load values from Matrix C */
1427 short8 c = vload8(0, (__global short *)src.ptr);
1428
1429 /* Computes alpha * axb + beta * c */
1430 short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);
1431
1432 /* Store final result in axb matrix */
1433 vstore8(out, 0, (__global short *)dst.ptr);
1434}
1435#endif /* defined(FIXED_POINT_POSITION) */
1436#endif /* defined(BETA) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001437
Anthony Barbierac69aa12017-07-03 17:39:37 +01001438#ifdef WIDTH_VECTOR_A
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001439/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
1440 *
1441 * @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
1442 *
1443 * @attention The input A and matrix B must not be reshaped
1444 *
1445 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
1446 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1447 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1448 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1449 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1450 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001451 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001452 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1453 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1454 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1455 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1456 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
1457 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1458 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001459 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001460 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1461 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1462 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1463 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1464 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1465 */
1466__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
1467 TENSOR3D_DECLARATION(src1),
1468 IMAGE_DECLARATION(dst))
1469{
1470 int idx = get_global_id(0) * 4;
1471 int idy = get_global_id(1);
1472
1473 /* Compute the address for the vector A and matrix B */
1474 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
1475 src_addr.s1 += idx * sizeof(float);
1476
1477 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
1478
1479 float4 acc = 0.0f;
1480
1481 for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
1482 {
1483 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
1484 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1485 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
1486
1487 acc += b0 * (float4)a0.s0;
1488 acc += b1 * (float4)a0.s1;
1489 }
1490
1491 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
1492 {
1493 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
1494 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1495
1496 acc += b0 * (float4)a0;
1497 }
1498
1499 /* Compute destination address */
1500 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1501
1502 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
1503}
Anthony Barbierac69aa12017-07-03 17:39:37 +01001504#endif /* WIDTH_VECTOR_A */