blob: 15111ed35202563f362ee0a4bbe7b49dce82f643 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Gian Marco Iodice368da832017-07-03 12:33:49 +010026#ifdef FIXED_POINT_POSITION
27#include "fixed_point.h"
28#endif // FIXED_POINT_POSITION
29
Anthony Barbier6ff3b192017-09-04 18:44:23 +010030/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
31 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010032 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
34 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
35 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
36 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
37 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010038 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +010039 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
40 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
41 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
42 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
43 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
44 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010045__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
46 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +010047{
48 uint x = get_global_id(0);
49 uint y = get_global_id(1);
50
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +010051 // Compute address for Matrix B - source
Anthony Barbier6ff3b192017-09-04 18:44:23 +010052 Image src = CONVERT_TO_IMAGE_STRUCT(src);
53
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +010054 // Compute address for Matrix B transposed - destination. X and Y are swapped
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
56
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010057 uint4 b0 = vload4(0, (__global uint *)src.ptr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010058
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010059 vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010060}
61
62/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
63 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010064 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
Anthony Barbier6ff3b192017-09-04 18:44:23 +010065 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
66 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
67 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
68 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
69 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010070 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +010071 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
72 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
73 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
74 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
75 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
76 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010077__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
78 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +010079{
80 uint x = get_global_id(0);
81 uint y = get_global_id(1);
82
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +000083 // Compute address for Matrix B - source
Anthony Barbier6ff3b192017-09-04 18:44:23 +010084 Image src = CONVERT_TO_IMAGE_STRUCT(src);
85
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +000086 // Compute address for Matrix B transposed - destination. X and Y are swapped
Anthony Barbier6ff3b192017-09-04 18:44:23 +010087 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
88
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010089 ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010090
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +010091 vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010092}
93
94/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
95 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +010096 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
Anthony Barbier6ff3b192017-09-04 18:44:23 +010097 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
98 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
99 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
100 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
101 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100102 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
104 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
105 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
106 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
107 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
108 */
Gian Marco Iodice9f89bae2017-06-22 12:09:49 +0100109__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
110 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100111{
112 uint x = get_global_id(0);
113 uint y = get_global_id(1);
114
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000115 // Compute address for Matrix B - source
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100116 Image src = CONVERT_TO_IMAGE_STRUCT(src);
117
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000118 // Compute address for Matrix B transposed - destination. X and Y are swapped
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100119 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
120
121 uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
122
123 vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
124}
125
126/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
127 *
128 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
129 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
130 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
131 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
132 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
133 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100134 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100135 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
136 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
137 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
138 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
139 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
140 */
141__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
142 IMAGE_DECLARATION(dst))
143{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000144 // Compute source and destination addresses
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100145 Image src = CONVERT_TO_IMAGE_STRUCT(src);
146 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
147
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000148 // Load values from Matrix A
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100149 uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0)));
150 uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1)));
151 uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2)));
152 uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100153
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100154 uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0);
155 vstore4(val0, 0, ((__global uint *)dst.ptr) + 0);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100156
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100157 val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1);
158 vstore4(val0, 0, ((__global uint *)dst.ptr) + 4);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100159
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100160 val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2);
161 vstore4(val0, 0, ((__global uint *)dst.ptr) + 8);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100162
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100163 val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3);
164 vstore4(val0, 0, ((__global uint *)dst.ptr) + 12);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100165}
166
167/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
168 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100169 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100170 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
171 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
172 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
173 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
174 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100175 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100176 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
177 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
178 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
179 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
180 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
181 */
182__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
183 IMAGE_DECLARATION(dst))
184{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000185 // Compute source and destination addresses
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100186 Image src = CONVERT_TO_IMAGE_STRUCT(src);
187 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
188
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000189 // Load values from Matrix A
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100190 ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0)));
191 ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1)));
192 ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2)));
193 ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100194
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100195 ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1));
196 vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100197
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100198 val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3));
199 vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100200
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100201 val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5));
202 vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100203
Gian Marco Iodiceb93f5de2017-07-05 15:48:39 +0100204 val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7));
205 vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100206}
207
208/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
209 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100210 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100211 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
212 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
213 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
214 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
215 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100216 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100217 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
218 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
219 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
220 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
221 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
222 */
223__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
224 IMAGE_DECLARATION(dst))
225{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000226 // Compute source and destination addresses
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100227 Image src = CONVERT_TO_IMAGE_STRUCT(src);
228 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
229
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000230 // Load values from Matrix A
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100231 uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
232 uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
233 uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
234 uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
235
236 uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
237 (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
238 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
239
240 val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
241 (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
242 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
243
244 val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
245 (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
246 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
247
248 val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
249 (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
250 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
251}
252
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000253#if defined(COLS_B)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100254/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100255 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100256 *
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000257 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100258 *
259 * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
260 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
261 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
262 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
263 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
264 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100265 * @param[in] src1_ptr Pointer to the source matrix. Supported formats: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100266 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
267 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
268 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
269 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
270 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100271 * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100272 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
273 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
274 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
275 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
276 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
277 * @param[in] a_offset Offset to be added to each element of the matrix A
278 * @param[in] b_offset Offset to be added to each element of the matrix B.
279 * @param[in] c_offset Offset to be added to each element of the matrix C.
280 * @param[in] c_mult_int Multiplied with each element of the matrix C.
281 * @param[in] shift Number of bits to shift right the result.
282 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100283__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0),
284 IMAGE_DECLARATION(src1),
285 IMAGE_DECLARATION(dst),
286 int a_offset,
287 int b_offset,
288 int c_offset,
289 int c_mult_int,
290 int shift)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100291{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000292 // src_addr.s0 = address of matrix A
293 // src_addr.s1 = address of matrix B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100294
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000295 // Compute address for matrix A and B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100296 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
297 (src1_stride_y));
298
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000299 // Add offset_first_element_in_bytes
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100300 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
301
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000302 // Compute end row address for matrix B
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100303 int end_row_mtx_b = src_addr.s1 + COLS_B;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100304
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000305 // Reset accumulators
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100306 int16 c00 = 0.0f;
307 int16 c10 = 0.0f;
308 int16 c20 = 0.0f;
309 int16 c30 = 0.0f;
310
311 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
312 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000313 // Load values from matrix A (interleaved) and matrix B (transposed)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100314 int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
315 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
316
317 c00 += (int16)a0.s0 * b0;
318 c10 += (int16)a0.s1 * b0;
319 c20 += (int16)a0.s2 * b0;
320 c30 += (int16)a0.s3 * b0;
321
322 int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
323
324 c00 += (int16)a0.s4 * b1;
325 c10 += (int16)a0.s5 * b1;
326 c20 += (int16)a0.s6 * b1;
327 c30 += (int16)a0.s7 * b1;
328 }
329
330 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
331 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000332 // Load values from matrix A (interleaved) and matrix B (transposed)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100333 int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
334 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
335
336 c00 += (int16)a0.s0 * b0;
337 c10 += (int16)a0.s1 * b0;
338 c20 += (int16)a0.s2 * b0;
339 c30 += (int16)a0.s3 * b0;
340 }
341
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000342 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100343 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
344
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000345 // Multiply by the weight of matrix product
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100346 c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
347 c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
348 c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
349 c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
350
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000351 // Store 4x16 block
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100352 vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
353 vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
354 vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
355 vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
356}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100357
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100358/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100359 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100360 *
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000361 * @attention The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100362 *
363 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
364 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
365 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
366 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
367 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
368 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100369 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100370 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
371 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
372 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
373 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
374 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100375 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100376 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
377 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
378 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
379 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
380 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
381 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100382__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
383 IMAGE_DECLARATION(src1),
384 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100385{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000386 // src_addr.s0 = address of matrix A
387 // src_addr.s1 = address of matrix B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100388
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000389 // Compute address for matrix A and B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100390 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
391 (src1_stride_y));
392
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000393 // Add offset_first_element_in_bytes
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100394 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
395
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000396 // Divide by 4 in order to get the src_addr in unit of float
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100397 src_addr = src_addr >> 2;
398
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000399 // Compute end row address for matrix B
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100400 int end_row_mtx_b = src_addr.s1 + COLS_B;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100401
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000402 // Reset accumulators
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100403 float4 c00 = 0.0f;
404 float4 c10 = 0.0f;
405 float4 c20 = 0.0f;
406 float4 c30 = 0.0f;
407
408 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
409 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000410 // Load values from matrix A (interleaved) and matrix B (transposed)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100411 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
412 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
413
414 c00 += (float4)a0.s0 * b0;
415 c10 += (float4)a0.s1 * b0;
416 c20 += (float4)a0.s2 * b0;
417 c30 += (float4)a0.s3 * b0;
418
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000419 // Load values from matrix A (interleaved) and matrix B (transposed)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100420 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
421 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
422
423 c00 += (float4)a0.s0 * b0;
424 c10 += (float4)a0.s1 * b0;
425 c20 += (float4)a0.s2 * b0;
426 c30 += (float4)a0.s3 * b0;
427 }
428
429 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
430 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000431 // Load values from matrix A (interleaved) and matrix B (transposed)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100432 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
433 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
434
435 c00 += (float4)a0.s0 * b0;
436 c10 += (float4)a0.s1 * b0;
437 c20 += (float4)a0.s2 * b0;
438 c30 += (float4)a0.s3 * b0;
439 }
440
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000441 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100442 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
443
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000444#if defined(ALPHA)
445 // Multiply by the weight of matrix product
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100446 c00 = c00 * (float4)ALPHA;
447 c10 = c10 * (float4)ALPHA;
448 c20 = c20 * (float4)ALPHA;
449 c30 = c30 * (float4)ALPHA;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000450#endif // defined(ALPHA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100451
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000452 // Store 4x4 block
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100453 vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
454 vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
455 vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
456 vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
457}
458
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000459/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100460 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100461 *
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000462 * @attention The number of matrix B columns and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100463 *
464 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
465 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
466 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
467 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
468 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
469 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100470 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100471 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
472 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
473 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
474 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
475 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100476 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100477 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
478 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
479 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
480 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
481 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
482 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100483__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
484 IMAGE_DECLARATION(src1),
485 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100486{
487 // src_addr_a = address of matrix A
488 // src_addr_b = address of matrix B
489 __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
490 __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
491
492 // Compute end row address for matrix B
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100493 __global float *src_end_addr_b = src_addr_b + COLS_B;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100494
495 // Reset accumulators
496 float c00 = 0.0f;
497 float c01 = 0.0f;
498 float c02 = 0.0f;
499 float c03 = 0.0f;
500 float c10 = 0.0f;
501 float c11 = 0.0f;
502 float c12 = 0.0f;
503 float c13 = 0.0f;
504 float c20 = 0.0f;
505 float c21 = 0.0f;
506 float c22 = 0.0f;
507 float c23 = 0.0f;
508 float c30 = 0.0f;
509 float c31 = 0.0f;
510 float c32 = 0.0f;
511 float c33 = 0.0f;
512
513 for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
514 {
515 // Load values from matrix A (interleaved) and matrix B (transposed)
516 float4 a0 = vload4(0, src_addr_a);
517 float4 b0 = vload4(0, src_addr_b);
518
519 c00 = fma(a0.s0, b0.s0, c00);
520 c01 = fma(a0.s0, b0.s1, c01);
521 c02 = fma(a0.s0, b0.s2, c02);
522 c03 = fma(a0.s0, b0.s3, c03);
523
524 c10 = fma(a0.s1, b0.s0, c10);
525 c11 = fma(a0.s1, b0.s1, c11);
526 c12 = fma(a0.s1, b0.s2, c12);
527 c13 = fma(a0.s1, b0.s3, c13);
528
529 c20 = fma(a0.s2, b0.s0, c20);
530 c21 = fma(a0.s2, b0.s1, c21);
531 c22 = fma(a0.s2, b0.s2, c22);
532 c23 = fma(a0.s2, b0.s3, c23);
533
534 c30 = fma(a0.s3, b0.s0, c30);
535 c31 = fma(a0.s3, b0.s1, c31);
536 c32 = fma(a0.s3, b0.s2, c32);
537 c33 = fma(a0.s3, b0.s3, c33);
538
539 // Load values from matrix A (interleaved) and matrix B (transposed)
540 a0 = vload4(0, src_addr_a + 4);
541 b0 = vload4(0, src_addr_b + 4);
542
543 c00 = fma(a0.s0, b0.s0, c00);
544 c01 = fma(a0.s0, b0.s1, c01);
545 c02 = fma(a0.s0, b0.s2, c02);
546 c03 = fma(a0.s0, b0.s3, c03);
547
548 c10 = fma(a0.s1, b0.s0, c10);
549 c11 = fma(a0.s1, b0.s1, c11);
550 c12 = fma(a0.s1, b0.s2, c12);
551 c13 = fma(a0.s1, b0.s3, c13);
552
553 c20 = fma(a0.s2, b0.s0, c20);
554 c21 = fma(a0.s2, b0.s1, c21);
555 c22 = fma(a0.s2, b0.s2, c22);
556 c23 = fma(a0.s2, b0.s3, c23);
557
558 c30 = fma(a0.s3, b0.s0, c30);
559 c31 = fma(a0.s3, b0.s1, c31);
560 c32 = fma(a0.s3, b0.s2, c32);
561 c33 = fma(a0.s3, b0.s3, c33);
562
563 // Load values from matrix A (interleaved) and matrix B (transposed)
564 a0 = vload4(0, src_addr_a + 8);
565 b0 = vload4(0, src_addr_b + 8);
566
567 c00 = fma(a0.s0, b0.s0, c00);
568 c01 = fma(a0.s0, b0.s1, c01);
569 c02 = fma(a0.s0, b0.s2, c02);
570 c03 = fma(a0.s0, b0.s3, c03);
571
572 c10 = fma(a0.s1, b0.s0, c10);
573 c11 = fma(a0.s1, b0.s1, c11);
574 c12 = fma(a0.s1, b0.s2, c12);
575 c13 = fma(a0.s1, b0.s3, c13);
576
577 c20 = fma(a0.s2, b0.s0, c20);
578 c21 = fma(a0.s2, b0.s1, c21);
579 c22 = fma(a0.s2, b0.s2, c22);
580 c23 = fma(a0.s2, b0.s3, c23);
581
582 c30 = fma(a0.s3, b0.s0, c30);
583 c31 = fma(a0.s3, b0.s1, c31);
584 c32 = fma(a0.s3, b0.s2, c32);
585 c33 = fma(a0.s3, b0.s3, c33);
586
587 // Load values from matrix A (interleaved) and matrix B (transposed)
588 a0 = vload4(0, src_addr_a + 12);
589 b0 = vload4(0, src_addr_b + 12);
590
591 c00 = fma(a0.s0, b0.s0, c00);
592 c01 = fma(a0.s0, b0.s1, c01);
593 c02 = fma(a0.s0, b0.s2, c02);
594 c03 = fma(a0.s0, b0.s3, c03);
595
596 c10 = fma(a0.s1, b0.s0, c10);
597 c11 = fma(a0.s1, b0.s1, c11);
598 c12 = fma(a0.s1, b0.s2, c12);
599 c13 = fma(a0.s1, b0.s3, c13);
600
601 c20 = fma(a0.s2, b0.s0, c20);
602 c21 = fma(a0.s2, b0.s1, c21);
603 c22 = fma(a0.s2, b0.s2, c22);
604 c23 = fma(a0.s2, b0.s3, c23);
605
606 c30 = fma(a0.s3, b0.s0, c30);
607 c31 = fma(a0.s3, b0.s1, c31);
608 c32 = fma(a0.s3, b0.s2, c32);
609 c33 = fma(a0.s3, b0.s3, c33);
610 }
611
612 for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
613 {
614 // Load values from matrix A (interleaved) and matrix B (transposed)
615 float4 a0 = vload4(0, src_addr_a);
616 float4 b0 = vload4(0, src_addr_b);
617
618 c00 = fma(a0.s0, b0.s0, c00);
619 c01 = fma(a0.s0, b0.s1, c01);
620 c02 = fma(a0.s0, b0.s2, c02);
621 c03 = fma(a0.s0, b0.s3, c03);
622
623 c10 = fma(a0.s1, b0.s0, c10);
624 c11 = fma(a0.s1, b0.s1, c11);
625 c12 = fma(a0.s1, b0.s2, c12);
626 c13 = fma(a0.s1, b0.s3, c13);
627
628 c20 = fma(a0.s2, b0.s0, c20);
629 c21 = fma(a0.s2, b0.s1, c21);
630 c22 = fma(a0.s2, b0.s2, c22);
631 c23 = fma(a0.s2, b0.s3, c23);
632
633 c30 = fma(a0.s3, b0.s0, c30);
634 c31 = fma(a0.s3, b0.s1, c31);
635 c32 = fma(a0.s3, b0.s2, c32);
636 c33 = fma(a0.s3, b0.s3, c33);
637 }
638
639 // Compute destination address
640 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
641
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000642#if defined(ALPHA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100643 // Multiply by the weight of matrix product
644 c00 = c00 * ALPHA;
645 c01 = c01 * ALPHA;
646 c02 = c02 * ALPHA;
647 c03 = c03 * ALPHA;
648 c10 = c10 * ALPHA;
649 c11 = c11 * ALPHA;
650 c12 = c12 * ALPHA;
651 c13 = c13 * ALPHA;
652 c20 = c20 * ALPHA;
653 c21 = c21 * ALPHA;
654 c22 = c22 * ALPHA;
655 c23 = c23 * ALPHA;
656 c30 = c30 * ALPHA;
657 c31 = c31 * ALPHA;
658 c32 = c32 * ALPHA;
659 c33 = c33 * ALPHA;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000660#endif // defined(ALPHA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100661
662 barrier(CLK_GLOBAL_MEM_FENCE);
663
664 // Store 4x4 block
665 vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
666 vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
667 vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
668 vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
669}
670
Matthew Bentham6f31f8c2017-10-27 11:50:06 +0100671#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100672/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100673 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100674 *
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000675 * @attention The number of matrix B columns and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100676 *
677 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
678 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
679 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
680 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
681 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
682 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100683 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100684 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
685 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
686 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
687 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
688 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100689 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100690 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
691 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
692 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
693 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
694 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
695 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100696__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
697 IMAGE_DECLARATION(src1),
698 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100699{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000700 // src_addr.s0 = address of matrix A
701 // src_addr.s1 = address of matrix B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100702
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000703 // Compute address for matrix A and B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100704 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
705 (src1_stride_y));
706
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000707 // Add offset_first_element_in_bytes
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100708 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
709
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000710 // Divide by 2 in order to get the src_addr in unit of half
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100711 src_addr = src_addr >> 1;
712
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000713 // Compute end row address for matrix B
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100714 int end_row_mtx_b = src_addr.s1 + COLS_B;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100715
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000716 // Reset accumulators
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100717 half8 c00 = 0.0f;
718 half8 c10 = 0.0f;
719 half8 c20 = 0.0f;
720 half8 c30 = 0.0f;
721
Moritz Pflanzere49e2662017-07-21 15:55:28 +0100722 for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100723 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000724 // Load values from matrix A (interleaved) and matrix B (transposed)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100725 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
726 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
727
728 c00 += (half8)a0.s0 * b0;
729 c10 += (half8)a0.s1 * b0;
730 c20 += (half8)a0.s2 * b0;
731 c30 += (half8)a0.s3 * b0;
732
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000733 // Load values from matrix A (interleaved) and matrix B (transposed)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100734 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
735 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
736
737 c00 += (half8)a0.s0 * b0;
738 c10 += (half8)a0.s1 * b0;
739 c20 += (half8)a0.s2 * b0;
740 c30 += (half8)a0.s3 * b0;
741 }
742
743 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
744 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000745 // Load values from matrix A (interleaved) and matrix B (transposed)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100746 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
747 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
748
749 c00 += (half8)a0.s0 * b0;
750 c10 += (half8)a0.s1 * b0;
751 c20 += (half8)a0.s2 * b0;
752 c30 += (half8)a0.s3 * b0;
753 }
754
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000755 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100756 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
757
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000758#if defined(ALPHA)
759 // Multiply by the weight of matrix product
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100760 c00 = c00 * (half8)ALPHA;
761 c10 = c10 * (half8)ALPHA;
762 c20 = c20 * (half8)ALPHA;
763 c30 = c30 * (half8)ALPHA;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000764#endif // defined(ALPHA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100765
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000766 // Store 4x8 block
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100767 vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
768 vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
769 vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
770 vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
771}
Matthew Bentham6f31f8c2017-10-27 11:50:06 +0100772#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100773
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000774#if defined(FIXED_POINT_POSITION)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100775/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision
776 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
777 *
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000778 * @attention The number of matrix B columns, the optional alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100779 *
780 * @note: ALPHA must be passed in 8 bit fixed point format
781 *
782 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
783 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
784 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
785 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
786 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
787 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
788 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
789 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
790 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
791 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
792 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
793 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
794 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
795 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
796 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
797 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
798 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
799 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
800 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100801__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
802 IMAGE_DECLARATION(src1),
803 IMAGE_DECLARATION(dst))
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100804{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000805 // src_addr.s0 = address of matrix A
806 // src_addr.s1 = address of matrix B
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100807
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000808 // Compute address for matrix A and B
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100809 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
810 (src1_stride_y));
811
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000812 // Add offset_first_element_in_bytes
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100813 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
814
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000815 // Compute end row address for matrix B
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100816 int end_row_mtx_b = src_addr.s1 + COLS_B;
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100817
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000818 // Reset accumulators
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100819 short8 c00 = 0.0f;
820 short8 c10 = 0.0f;
821 short8 c20 = 0.0f;
822 short8 c30 = 0.0f;
823 short8 c01 = 0.0f;
824 short8 c11 = 0.0f;
825 short8 c21 = 0.0f;
826 short8 c31 = 0.0f;
827
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000828 // This for loop performs 1 accumulation for each iteration
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100829 for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
830 {
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000831 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100832 char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
833 char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
834
835 c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
836 c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
837 c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);
838 c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);
839
840 c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
841 c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);
842 c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);
843 c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);
844 }
845
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000846 // Compute destination address
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100847 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
848
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000849 // Multiply by the weight of matrix product
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100850 char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));
851 char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));
852 char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));
853 char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));
854
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000855#if defined(ALPHA)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100856 c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
857 c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
858 c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
859 c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000860#endif // defined(ALPHA)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100861
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000862 // Store 16x4 block
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100863 vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
864 vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
865 vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
866 vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
867}
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100868
869/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
870 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
871 *
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000872 * @attention The number of matrix B columns, the optional alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100873 *
874 * @note: ALPHA must be passed in 16 bit fixed point format
875 *
876 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
877 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
878 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
879 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
880 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
881 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
882 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
883 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
884 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
885 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
886 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
887 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
888 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
889 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
890 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
891 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
892 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
893 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
894 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100895__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
896 IMAGE_DECLARATION(src1),
897 IMAGE_DECLARATION(dst))
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100898{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000899 // src_addr.s0 = address of matrix A
900 // src_addr.s1 = address of matrix B
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100901
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000902 // Compute address for matrix A and B
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100903 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
904 (src1_stride_y));
905
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000906 // Add offset_first_element_in_bytes
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100907 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
908
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000909 // Divide by 2 in order to get the src_addr in unit of short
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100910 src_addr = src_addr >> 1;
911
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000912 // Compute end row address for matrix B
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100913 int end_row_mtx_b = src_addr.s1 + COLS_B;
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100914
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000915 // Reset accumulators
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100916 int8 c00 = 0.0f;
917 int8 c10 = 0.0f;
918 int8 c20 = 0.0f;
919 int8 c30 = 0.0f;
920
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000921 // This for loop performs 1 accumulation for each iteration
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100922 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
923 {
924 /* Load values from matrix A (interleaved) and matrix B (transposed) */
925 short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);
926 short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);
927
928 c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
929 c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
930 c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);
931 c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);
932 }
933
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000934 // Compute destination address
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100935 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
936
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000937 // Multiply by the weight of matrix product
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100938 short8 c00_qs16 = convert_short8_sat(c00);
939 short8 c10_qs16 = convert_short8_sat(c10);
940 short8 c20_qs16 = convert_short8_sat(c20);
941 short8 c30_qs16 = convert_short8_sat(c30);
942
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000943#if defined(ALPHA)
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100944 c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
945 c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
946 c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
947 c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000948#endif // defined(ALPHA)
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100949
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000950 // Store 8x4 block
Gian Marco Iodice8a383692017-07-03 17:41:47 +0100951 vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
952 vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
953 vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
954 vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
955}
956#endif // defined(FIXED_POINT_POSITION)
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000957#endif // defined(COLS_B)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100958
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100959#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
960#if defined(DATA_TYPE)
961#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
962/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100963 *
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100964 * @note This OpenCL kernel works with floating point data types (F16/F32)
965 * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
966 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +0000967 * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100968 *
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100969 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100970 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
971 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
972 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
973 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
974 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100975 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100976 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
977 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
978 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
979 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
980 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +0100981 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100982 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
983 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
984 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
985 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
986 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
987 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100988__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
989 IMAGE_DECLARATION(src1),
990 IMAGE_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100991{
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100992 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100993
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100994 // Compute starting address for matrix A and Matrix B
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100995 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100996
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +0100997 // Update address for the matrix A
998 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100999
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001000 // Update address for the matrix B
1001 src_addr.s1 += idx * sizeof(DATA_TYPE);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001002
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001003 int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
1004
1005 VECTOR_TYPE acc0 = 0.0f;
1006#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1007 VECTOR_TYPE acc1 = 0.0f;
1008#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1009#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1010 VECTOR_TYPE acc2 = 0.0f;
1011#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1012#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1013 VECTOR_TYPE acc3 = 0.0f;
1014#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1015
Georgios Pinitas96880cf2017-10-20 18:52:20 +01001016 for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001017 {
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001018 // Load values from matrix A
1019 VEC_DATA_TYPE(DATA_TYPE, 2)
1020 a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1021#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1022 VEC_DATA_TYPE(DATA_TYPE, 2)
1023 a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1024#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1025#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1026 VEC_DATA_TYPE(DATA_TYPE, 2)
1027 a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1028#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1029#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1030 VEC_DATA_TYPE(DATA_TYPE, 2)
1031 a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1032#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1033 // Load values from matrix B
1034 VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
1035 VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001036
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001037 // Accumulate
1038 acc0 += b0 * (VECTOR_TYPE)a0.s0;
1039 acc0 += b1 * (VECTOR_TYPE)a0.s1;
1040#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1041 acc1 += b0 * (VECTOR_TYPE)a1.s0;
1042 acc1 += b1 * (VECTOR_TYPE)a1.s1;
1043#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1044#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1045 acc2 += b0 * (VECTOR_TYPE)a2.s0;
1046 acc2 += b1 * (VECTOR_TYPE)a2.s1;
1047#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1048#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1049 acc3 += b0 * (VECTOR_TYPE)a3.s0;
1050 acc3 += b1 * (VECTOR_TYPE)a3.s1;
1051#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001052 }
1053
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001054 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001055 {
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001056 // Load values from matrix A
1057 DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1058#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1059 DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1060#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1061#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1062 DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1063#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1064#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1065 DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1066#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1067 // Load values from matrix B
1068 VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001069
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001070 // Accumulate
1071 acc0 += b0 * (VECTOR_TYPE)a0;
1072#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1073 acc1 += b0 * (VECTOR_TYPE)a1;
1074#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1075#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1076 acc2 += b0 * (VECTOR_TYPE)a2;
1077#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1078#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1079 acc3 += b0 * (VECTOR_TYPE)a3;
1080#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001081 }
1082
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001083 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001084 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1085
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001086 // Multiply by the weight of matrix-matrix product and store the result
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001087#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001088 acc0 = acc0 * (VECTOR_TYPE)ALPHA;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001089#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001090 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1091 (acc0, 0, (__global DATA_TYPE *)(offset(&dst, 0, 0)));
1092#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001093#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001094 acc1 = acc1 * (VECTOR_TYPE)ALPHA;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001095#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001096 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1097 (acc1, 0, (__global DATA_TYPE *)(offset(&dst, 0, 1)));
1098#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1099#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001100#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001101 acc2 = acc2 * (VECTOR_TYPE)ALPHA;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001102#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001103 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1104 (acc2, 0, (__global DATA_TYPE *)(offset(&dst, 0, 2)));
1105#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1106#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001107#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001108 acc3 = acc3 * (VECTOR_TYPE)ALPHA;
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001109#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001110 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1111 (acc3, 0, (__global DATA_TYPE *)(offset(&dst, 0, 3)));
1112#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001113}
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001114#endif // defined(DATA_TYPE)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001115
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001116/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
1117 *
1118 * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
1119 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
1120 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
1121 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
1122 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
1123 *
1124 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
1125 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1126 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1127 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1128 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1129 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1130 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
1131 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1132 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1133 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1134 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1135 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1136 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
1137 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1138 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1139 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1140 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1141 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1142 */
1143__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
1144 IMAGE_DECLARATION(src1),
1145 IMAGE_DECLARATION(dst))
1146{
1147 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
1148
1149 // Compute starting address for matrix A and matrix B
1150 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1151
1152 // Update address for matrix A
1153 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1154
1155 // Update address for matrix B
1156 src_addr.s1 += idx * sizeof(float);
1157
1158 // Address boundary for matrix A
1159 int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
1160
1161 // Initialize accumulators
1162 float acc00 = 0.0f;
1163 float acc01 = 0.0f;
1164 float acc02 = 0.0f;
1165 float acc03 = 0.0f;
1166
1167#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1168 float acc10 = 0.0f;
1169 float acc11 = 0.0f;
1170 float acc12 = 0.0f;
1171 float acc13 = 0.0f;
1172#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1173
1174#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1175 float acc20 = 0.0f;
1176 float acc21 = 0.0f;
1177 float acc22 = 0.0f;
1178 float acc23 = 0.0f;
1179#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1180
1181#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1182 float acc30 = 0.0f;
1183 float acc31 = 0.0f;
1184 float acc32 = 0.0f;
1185 float acc33 = 0.0f;
1186#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1187
1188 // A and B src indices get incremented at the same time.
1189 for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
1190 {
1191 // Load values from matrix A
1192 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1193#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1194 float2 a1 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1195#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1196#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1197 float2 a2 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1198#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1199#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1200 float2 a3 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1201#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1202 // Load values from matrix B
1203 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
1204 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
1205
1206 // Multiply and accumulate
1207 acc00 = fma(a0.s0, b0.s0, acc00);
1208 acc00 = fma(a0.s1, b1.s0, acc00);
1209 acc01 = fma(a0.s0, b0.s1, acc01);
1210 acc01 = fma(a0.s1, b1.s1, acc01);
1211 acc02 = fma(a0.s0, b0.s2, acc02);
1212 acc02 = fma(a0.s1, b1.s2, acc02);
1213 acc03 = fma(a0.s1, b1.s3, acc03);
1214 acc03 = fma(a0.s0, b0.s3, acc03);
1215
1216#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1217 acc10 = fma(a1.s0, b0.s0, acc10);
1218 acc11 = fma(a1.s0, b0.s1, acc11);
1219 acc12 = fma(a1.s0, b0.s2, acc12);
1220 acc13 = fma(a1.s0, b0.s3, acc13);
1221
1222 acc10 = fma(a1.s1, b1.s0, acc10);
1223 acc11 = fma(a1.s1, b1.s1, acc11);
1224 acc12 = fma(a1.s1, b1.s2, acc12);
1225 acc13 = fma(a1.s1, b1.s3, acc13);
1226#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1227#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1228 acc20 = fma(a2.s0, b0.s0, acc20);
1229 acc21 = fma(a2.s0, b0.s1, acc21);
1230 acc22 = fma(a2.s0, b0.s2, acc22);
1231 acc23 = fma(a2.s0, b0.s3, acc23);
1232
1233 acc20 = fma(a2.s1, b1.s0, acc20);
1234 acc21 = fma(a2.s1, b1.s1, acc21);
1235 acc22 = fma(a2.s1, b1.s2, acc22);
1236 acc23 = fma(a2.s1, b1.s3, acc23);
1237#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1238#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1239 acc30 = fma(a3.s0, b0.s0, acc30);
1240 acc31 = fma(a3.s0, b0.s1, acc31);
1241 acc32 = fma(a3.s0, b0.s2, acc32);
1242 acc33 = fma(a3.s0, b0.s3, acc33);
1243
1244 acc30 = fma(a3.s1, b1.s0, acc30);
1245 acc31 = fma(a3.s1, b1.s1, acc31);
1246 acc32 = fma(a3.s1, b1.s2, acc32);
1247 acc33 = fma(a3.s1, b1.s3, acc33);
1248#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1249 }
1250
1251 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
1252 {
1253 // Load values from matrix A
1254 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1255#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1256 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1257#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1258#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1259 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1260#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1261#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1262 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1263#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1264 // Load values from matrix B
1265 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
1266
1267 // Multiply and accumulate
1268 acc00 = fma(a0, b0.s0, acc00);
1269 acc01 = fma(a0, b0.s1, acc01);
1270 acc02 = fma(a0, b0.s2, acc02);
1271 acc03 = fma(a0, b0.s3, acc03);
1272#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1273 acc10 = fma(a1, b0.s0, acc10);
1274 acc11 = fma(a1, b0.s1, acc11);
1275 acc12 = fma(a1, b0.s2, acc12);
1276 acc13 = fma(a1, b0.s3, acc13);
1277#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1278#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1279 acc20 = fma(a2, b0.s0, acc20);
1280 acc21 = fma(a2, b0.s1, acc21);
1281 acc22 = fma(a2, b0.s2, acc22);
1282 acc23 = fma(a2, b0.s3, acc23);
1283#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1284#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1285 acc30 = fma(a3, b0.s0, acc30);
1286 acc31 = fma(a3, b0.s1, acc31);
1287 acc32 = fma(a3, b0.s2, acc32);
1288 acc33 = fma(a3, b0.s3, acc33);
1289#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1290 }
1291
1292 // Compute destination address
1293 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1294
1295 // Multiply by the weight of matrix-matrix product and store the result
1296#if defined(ALPHA)
1297 acc00 = acc00 * ALPHA;
1298 acc01 = acc01 * ALPHA;
1299 acc02 = acc02 * ALPHA;
1300 acc03 = acc03 * ALPHA;
1301#endif // defined(ALPHA)
1302
1303 float4 acc0 = ((float4)(acc00, acc01, acc02, acc03));
1304 vstore4(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
1305
1306#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1307#if defined(ALPHA)
1308 acc10 = acc10 * ALPHA;
1309 acc11 = acc11 * ALPHA;
1310 acc12 = acc12 * ALPHA;
1311 acc13 = acc13 * ALPHA;
1312#endif // defined(ALPHA)
1313 float4 acc1 = ((float4)(acc10, acc11, acc12, acc13));
1314 vstore4(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
1315#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1316#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1317#if defined(ALPHA)
1318 acc20 = acc20 * ALPHA;
1319 acc21 = acc21 * ALPHA;
1320 acc22 = acc22 * ALPHA;
1321 acc23 = acc23 * ALPHA;
1322#endif // defined(ALPHA)
1323 float4 acc2 = ((float4)(acc20, acc21, acc22, acc23));
1324 vstore4(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
1325#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1326#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1327#if defined(ALPHA)
1328 acc30 = acc30 * ALPHA;
1329 acc31 = acc31 * ALPHA;
1330 acc32 = acc32 * ALPHA;
1331 acc33 = acc33 * ALPHA;
1332#endif // defined(ALPHA)
1333 float4 acc3 = ((float4)(acc30, acc31, acc32, acc33));
1334 vstore4(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
1335#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1336}
1337
1338/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
1339 *
1340 * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
1341 * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.
1342 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
1343 * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
1344 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
1345 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
1346 *
1347 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
1348 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1349 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1350 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1351 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1352 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1353 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
1354 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1355 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1356 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1357 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1358 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1359 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
1360 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1361 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1362 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1363 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1364 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1365 */
1366__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
1367 IMAGE_DECLARATION(src1),
1368 IMAGE_DECLARATION(dst))
1369{
1370 // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1371 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
1372
1373 // Compute starting address for matrix A and Matrix B
1374 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1375
1376 // Update address for the matrix A
1377 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1378
1379 // Update address for the matrix B
1380 src_addr.s1 += idx * sizeof(float);
1381
1382 // Address boundary for the matrix A
1383 int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
1384
1385 // Initialize accumulators
1386 float acc00 = 0.0f;
1387 float acc01 = 0.0f;
1388
1389#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1390 float acc10 = 0.0f;
1391 float acc11 = 0.0f;
1392#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1393#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1394 float acc20 = 0.0f;
1395 float acc21 = 0.0f;
1396#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1397#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1398 float acc30 = 0.0f;
1399 float acc31 = 0.0f;
1400#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1401
1402 // A and B src indices get incremented at the same time.
1403 for(; src_addr.s0 <= (end_row_vec_a - 4 * (int)sizeof(float)); src_addr += (int2)(4 * sizeof(float), 4 * src1_stride_y))
1404 {
1405 // Load values from matrix A
1406 float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1407
1408 // Load values from matrix B
1409 float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
1410 float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
1411 float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
1412 float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
1413
1414 // Multiply and accumulate
1415 acc00 = fma(a0.s0, b0.s0, acc00);
1416 acc00 = fma(a0.s1, b1.s0, acc00);
1417 acc00 = fma(a0.s2, b2.s0, acc00);
1418 acc00 = fma(a0.s3, b3.s0, acc00);
1419
1420 acc01 = fma(a0.s0, b0.s1, acc01);
1421 acc01 = fma(a0.s1, b1.s1, acc01);
1422 acc01 = fma(a0.s2, b2.s1, acc01);
1423 acc01 = fma(a0.s3, b3.s1, acc01);
1424
1425#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1426 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1427 acc10 = fma(a0.s0, b0.s0, acc10);
1428 acc10 = fma(a0.s1, b1.s0, acc10);
1429 acc10 = fma(a0.s2, b2.s0, acc10);
1430 acc10 = fma(a0.s3, b3.s0, acc10);
1431
1432 acc11 = fma(a0.s0, b0.s1, acc11);
1433 acc11 = fma(a0.s1, b1.s1, acc11);
1434 acc11 = fma(a0.s2, b2.s1, acc11);
1435 acc11 = fma(a0.s3, b3.s1, acc11);
1436#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1437#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1438 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1439 acc20 = fma(a0.s0, b0.s0, acc20);
1440 acc20 = fma(a0.s1, b1.s0, acc20);
1441 acc20 = fma(a0.s2, b2.s0, acc20);
1442 acc20 = fma(a0.s3, b3.s0, acc20);
1443
1444 acc21 = fma(a0.s0, b0.s1, acc21);
1445 acc21 = fma(a0.s1, b1.s1, acc21);
1446 acc21 = fma(a0.s2, b2.s1, acc21);
1447 acc21 = fma(a0.s3, b3.s1, acc21);
1448#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1449#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1450 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1451 acc30 = fma(a0.s0, b0.s0, acc30);
1452 acc30 = fma(a0.s1, b1.s0, acc30);
1453 acc30 = fma(a0.s2, b2.s0, acc30);
1454 acc30 = fma(a0.s3, b3.s0, acc30);
1455
1456 acc31 = fma(a0.s0, b0.s1, acc31);
1457 acc31 = fma(a0.s1, b1.s1, acc31);
1458 acc31 = fma(a0.s2, b2.s1, acc31);
1459 acc31 = fma(a0.s3, b3.s1, acc31);
1460#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1461 }
1462 // float size increment
1463 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(4, src1_stride_y))
1464 {
1465 // Load values from matrix A
1466 float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1467#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1468 float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1469#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1470#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1471 float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1472#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1473#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1474 float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1475#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1476 // Load values from matrix B
1477 float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
1478
1479 // Multiply and accumulate
1480 acc00 = fma(a0, b0.s0, acc00);
1481 acc01 = fma(a0, b0.s1, acc01);
1482#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1483 acc10 = fma(a1, b0.s0, acc10);
1484 acc11 = fma(a1, b0.s1, acc11);
1485#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1486#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1487 acc20 = fma(a2, b0.s0, acc20);
1488 acc21 = fma(a2, b0.s1, acc21);
1489#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1490#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1491 acc30 = fma(a3, b0.s0, acc30);
1492 acc31 = fma(a3, b0.s1, acc31);
1493#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1494 }
1495
1496 // Compute destination address
1497 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1498
1499 // Multiply by the weight of matrix-matrix product and store the result
1500#if defined(ALPHA)
1501 acc00 = acc00 * ALPHA;
1502 acc01 = acc01 * ALPHA;
1503#endif // defined(ALPHA)
1504 float2 acc0 = ((float2)(acc00, acc01));
1505 vstore2(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
1506#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1507#if defined(ALPHA)
1508 acc10 = acc10 * ALPHA;
1509 acc11 = acc11 * ALPHA;
1510#endif // defined(ALPHA)
1511 float2 acc1 = ((float2)(acc10, acc11));
1512 vstore2(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
1513#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1514#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1515#if defined(ALPHA)
1516 acc20 = acc20 * ALPHA;
1517 acc21 = acc21 * ALPHA;
1518#endif // defined(ALPHA)
1519 float2 acc2 = ((float2)(acc20, acc21));
1520 vstore2(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
1521#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1522#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1523#if defined(ALPHA)
1524 acc30 = acc30 * ALPHA;
1525 acc31 = acc31 * ALPHA;
1526#endif // defined(ALPHA)
1527 float2 acc3 = (float2)(acc30, acc31);
1528 vstore2(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
1529#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1530}
1531
1532#if defined(FIXED_POINT_POSITION)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001533/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001534 *
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001535 * @note This OpenCL kernel works with fixed point data types QS8
1536 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001537 * @note The number matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001538 * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001539 * @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001540 *
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001541 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001542 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1543 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1544 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1545 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1546 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1547 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
1548 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1549 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1550 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1551 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1552 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1553 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
1554 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1555 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1556 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1557 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1558 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1559 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001560__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001561 IMAGE_DECLARATION(src1),
1562 IMAGE_DECLARATION(dst))
1563{
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001564 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001565
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001566 // Compute starting address for matrix A and Matrix B
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001567 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001568
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001569 // Update address for the matrix A
1570 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001571
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001572 // Update address for the matrix B
1573 src_addr.s1 += idx * sizeof(char);
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001574
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001575 int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
1576
1577 short8 acc00 = 0;
1578 short8 acc01 = 0;
1579#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1580 short8 acc10 = 0;
1581 short8 acc11 = 0;
1582#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1583#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1584 short8 acc20 = 0;
1585 short8 acc21 = 0;
1586#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1587#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1588 short8 acc30 = 0;
1589 short8 acc31 = 0;
1590#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1591
1592 // This for loop performs 4 accumulations per iteration
1593 for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001594 {
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001595 char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1596#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1597 char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1598#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1599#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1600 char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1601#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1602#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1603 char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1604#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001605 char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
1606 char16 b1 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001607
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001608 acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
1609 acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s1, b1.s01234567, FIXED_POINT_POSITION);
1610 acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
1611 acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
1612#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1613 acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s0, b0.s01234567, FIXED_POINT_POSITION);
1614 acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s1, b1.s01234567, FIXED_POINT_POSITION);
1615 acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
1616 acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
1617#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1618#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1619 acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s0, b0.s01234567, FIXED_POINT_POSITION);
1620 acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s1, b1.s01234567, FIXED_POINT_POSITION);
1621 acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
1622 acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
1623#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1624#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1625 acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s0, b0.s01234567, FIXED_POINT_POSITION);
1626 acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s1, b1.s01234567, FIXED_POINT_POSITION);
1627 acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
1628 acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
1629#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001630 }
1631
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001632 // Left-over accumulations
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001633 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
1634 {
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001635 char a0 = *((__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1636#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1637 char a1 = *((__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1638#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1639#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1640 char a2 = *((__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1641#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1642#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1643 char a3 = *((__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1644#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001645 char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1));
1646
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001647 acc00 = mlal_sat_qs8x8(acc00, (char8)a0, b0.s01234567, FIXED_POINT_POSITION);
1648 acc01 = mlal_sat_qs8x8(acc01, (char8)a0, b0.s89ABCDEF, FIXED_POINT_POSITION);
1649#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1650 acc10 = mlal_sat_qs8x8(acc10, (char8)a1, b0.s01234567, FIXED_POINT_POSITION);
1651 acc11 = mlal_sat_qs8x8(acc11, (char8)a1, b0.s89ABCDEF, FIXED_POINT_POSITION);
1652#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1653#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1654 acc20 = mlal_sat_qs8x8(acc20, (char8)a2, b0.s01234567, FIXED_POINT_POSITION);
1655 acc21 = mlal_sat_qs8x8(acc21, (char8)a2, b0.s89ABCDEF, FIXED_POINT_POSITION);
1656#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1657#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1658 acc30 = mlal_sat_qs8x8(acc30, (char8)a3, b0.s01234567, FIXED_POINT_POSITION);
1659 acc31 = mlal_sat_qs8x8(acc31, (char8)a3, b0.s89ABCDEF, FIXED_POINT_POSITION);
1660#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001661 }
1662
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001663 // Compute destination address
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001664 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1665
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001666 // Multiply by the weight of matrix product and store the result
1667 char16 acc_qs8;
1668 acc_qs8 = convert_char16_sat((short16)(acc00, acc01));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001669#if defined(ALPHA)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001670 acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001671#endif // defined(ALPHA)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001672 vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001673#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1674 acc_qs8 = convert_char16_sat((short16)(acc10, acc11));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001675#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001676 acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001677#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001678 vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
1679#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1680#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1681 acc_qs8 = convert_char16_sat((short16)(acc20, acc21));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001682#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001683 acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001684#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001685 vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
1686#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1687#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1688 acc_qs8 = convert_char16_sat((short16)(acc30, acc31));
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001689#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001690 acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001691#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001692 vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
1693#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001694}
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001695
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001696/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001697 *
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001698 * @note This OpenCL kernel works with fixed point data types QS16
1699 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001700 * @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001701 * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001702 * @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001703 *
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001704 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001705 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1706 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1707 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1708 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1709 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1710 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
1711 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1712 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1713 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1714 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1715 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1716 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
1717 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1718 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1719 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1720 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1721 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1722 */
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001723__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001724 IMAGE_DECLARATION(src1),
1725 IMAGE_DECLARATION(dst))
1726{
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001727 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001728
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001729 // Compute starting address for matrix A and Matrix B
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001730 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001731
1732 // Update address for the matrix A
1733 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1734
1735 // Update address for the matrix B
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001736 src_addr.s1 += idx * sizeof(short);
1737
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001738 int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001739
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001740 int8 acc0 = 0;
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001741#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1742 int8 acc1 = 0;
1743#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1744#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1745 int8 acc2 = 0;
1746#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1747#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1748 int8 acc3 = 0;
1749#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001750
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001751 // This for loop performs 4 accumulations per iteration
Georgios Pinitas96880cf2017-10-20 18:52:20 +01001752 for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001753 {
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001754 short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1755#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1756 short2 a1 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1757#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1758#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1759 short2 a2 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1760#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1761#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1762 short2 a3 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1763#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001764 short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
1765 short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001766
1767 acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);
1768 acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001769#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1770 acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s0, b0, FIXED_POINT_POSITION);
1771 acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s1, b1, FIXED_POINT_POSITION);
1772#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1773#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1774 acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s0, b0, FIXED_POINT_POSITION);
1775 acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s1, b1, FIXED_POINT_POSITION);
1776#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1777#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1778 acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s0, b0, FIXED_POINT_POSITION);
1779 acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s1, b1, FIXED_POINT_POSITION);
1780#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001781 }
1782
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001783 // Left-over accumulations
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001784 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))
1785 {
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001786 short a0 = *((__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1787#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1788 short a1 = *((__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1789#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1790#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1791 short a2 = *((__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1792#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1793#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1794 short a3 = *((__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1795#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001796 short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));
1797
1798 acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001799#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1800 acc1 = mlal_sat_qs16x8(acc1, (short8)a1, b0, FIXED_POINT_POSITION);
1801#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1802#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1803 acc2 = mlal_sat_qs16x8(acc2, (short8)a2, b0, FIXED_POINT_POSITION);
1804#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1805#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1806 acc3 = mlal_sat_qs16x8(acc3, (short8)a3, b0, FIXED_POINT_POSITION);
1807#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001808 }
1809
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001810 // Compute destination address
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001811 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1812
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001813 // Multiply by the weight of matrix product and store the result
1814 short8 acc_qs16;
1815 acc_qs16 = convert_short8_sat(acc0);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001816#if defined(ALPHA)
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001817 acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001818#endif // defined(ALPHA)
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001819 vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001820#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1821 acc_qs16 = convert_short8_sat(acc1);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001822#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001823 acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001824#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001825 vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
1826#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1827#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1828 acc_qs16 = convert_short8_sat(acc2);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001829#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001830 acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001831#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001832 vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
1833#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1834#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1835 acc_qs16 = convert_short8_sat(acc3);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001836#if defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001837 acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001838#endif // defined(ALPHA)
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001839 vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
1840#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001841}
Gian Marco Iodiceedfa9f42017-08-15 11:45:22 +01001842#endif // defined(FIXED_POINT_POSITION)
1843#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001844
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001845#if defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001846/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
1847 *
1848 * @attention The beta's value need to be passed at compile time using -DBETA
1849 *
1850 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
1851 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1852 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1853 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1854 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1855 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001856 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001857 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1858 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1859 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1860 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1861 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1862 */
1863__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
1864 IMAGE_DECLARATION(dst))
1865{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001866 // Compute source and destination addresses
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001867 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1868 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1869
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001870 // Load values from A x B
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001871 float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
1872
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001873 // Load values from Matrix C
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001874 float4 c = vload4(0, (__global float *)src.ptr);
1875
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001876 // Computes alpha * axb + beta * c
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001877 float4 out = alpha_ab + (float4)BETA * c;
1878
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001879 // Store final result in axb matrix
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001880 vstore4(out, 0, (__global float *)dst.ptr);
1881}
1882
1883/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
1884 *
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001885 * @attention The beta's value need to be passed at compile time using -DBETA
1886 *
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001887 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
1888 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1889 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1890 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1891 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1892 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001893 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001894 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1895 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1896 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1897 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1898 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1899 */
1900__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
1901 IMAGE_DECLARATION(dst))
1902{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001903 // Compute source and destination addresses
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001904 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1905 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1906
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001907 // Load values from A x B
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001908 half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
1909
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001910 // Load values from Matrix C
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001911 half8 c = vload8(0, (__global half *)src.ptr);
1912
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001913 // Computes alpha * axb + beta * c
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001914 half8 out = alpha_ab + (half8)BETA * c;
1915
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001916 // Store final result in axb matrix
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001917 vstore8(out, 0, (__global half *)dst.ptr);
1918}
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001919
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001920#if defined(FIXED_POINT_POSITION)
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001921/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
1922 *
1923 * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
1924 *
1925 * @note: BETA must be passed in 8 bit fixed point format
1926 *
1927 * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS8
1928 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1929 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1930 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1931 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1932 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
1933 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
1934 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1935 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1936 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1937 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1938 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1939 */
1940__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),
1941 IMAGE_DECLARATION(dst))
1942{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001943 // Compute source and destination addresses
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001944 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1945 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1946
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001947 // Load values from A x B
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001948 char16 alpha_ab = vload16(0, (__global char *)dst.ptr);
1949
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001950 // Load values from Matrix C
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001951 char16 c = vload16(0, (__global char *)src.ptr);
1952
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001953 // Computes alpha * axb + beta * c
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001954 char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);
1955
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001956 // Store final result in axb matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01001957 vstore16(out, 0, (__global char *)dst.ptr);
1958}
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001959
1960/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
1961 *
1962 * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
1963 *
1964 * @note: BETA must be passed in 16 bit fixed point format
1965 *
1966 * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16
1967 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
1968 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1969 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
1970 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1971 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
1972 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
1973 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1974 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1975 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1976 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1977 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1978 */
1979__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),
1980 IMAGE_DECLARATION(dst))
1981{
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001982 // Compute source and destination addresses
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001983 Image src = CONVERT_TO_IMAGE_STRUCT(src);
1984 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1985
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001986 // Load values from A x B
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001987 short8 alpha_ab = vload8(0, (__global short *)dst.ptr);
1988
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001989 // Load values from Matrix C
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001990 short8 c = vload8(0, (__global short *)src.ptr);
1991
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001992 // Computes alpha * axb + beta * c
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001993 short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);
1994
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001995 // Store final result in axb matrix
Gian Marco Iodice8a383692017-07-03 17:41:47 +01001996 vstore8(out, 0, (__global short *)dst.ptr);
1997}
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00001998#endif // defined(FIXED_POINT_POSITION)
1999#endif // defined(BETA)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002000
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00002001#if defined(WIDTH_VECTOR_A)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002002/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
2003 *
2004 * @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
2005 *
2006 * @attention The input A and matrix B must not be reshaped
2007 *
2008 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
2009 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
2010 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2011 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
2012 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2013 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01002014 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002015 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
2016 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2017 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
2018 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2019 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
2020 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2021 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice3a3066b2017-06-23 13:38:14 +01002022 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002023 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
2024 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2025 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
2026 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2027 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
2028 */
2029__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
2030 TENSOR3D_DECLARATION(src1),
2031 IMAGE_DECLARATION(dst))
2032{
2033 int idx = get_global_id(0) * 4;
2034 int idy = get_global_id(1);
2035
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00002036 // Compute the address for the vector A and matrix B
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002037 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
2038 src_addr.s1 += idx * sizeof(float);
2039
2040 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
2041
2042 float4 acc = 0.0f;
2043
Georgios Pinitas96880cf2017-10-20 18:52:20 +01002044 for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002045 {
2046 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
2047 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
2048 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
2049
2050 acc += b0 * (float4)a0.s0;
2051 acc += b1 * (float4)a0.s1;
2052 }
2053
2054 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
2055 {
2056 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
2057 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
2058
2059 acc += b0 * (float4)a0;
2060 }
2061
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00002062 // Compute destination address
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002063 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
2064
2065 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
2066}
Anton Lokhmotov3e80c7f2017-11-20 11:02:10 +00002067#endif // defined(WIDTH_VECTOR_A)
2068
2069/** This kernel accumulates each row with the biases vector.
2070 *
2071 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
2072 * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
2073 *
2074 * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/QS8/U16/S16/F16/U32/S32/F32
2075 * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
2076 * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
2077 * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
2078 * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2079 * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
2080 * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
2081 * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
2082 * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
2083 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
2084 */
2085#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
2086__kernel void gemm_accumulate_biases(
2087 IMAGE_DECLARATION(accum),
2088 VECTOR_DECLARATION(biases))
2089{
2090 Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
2091 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
2092
2093 // Vector size, i.e. number of vector elements.
2094 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
2095 accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
2096 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
2097 biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
2098#ifdef FIXED_POINT_POSITION
2099 accum_value = ADD_SAT_OP_EXPAND(biases_value, accum_value, DATA_TYPE, VECTOR_SIZE);
2100#else // FIXED_POINT_POSITION
2101 accum_value = biases_value + accum_value;
2102#endif // FIXED_POINT_POSITION
2103 // Store result in the accumulate buffer
2104 VSTORE(VECTOR_SIZE)
2105 (accum_value, 0, (__global DATA_TYPE *)accum.ptr);
2106}
2107#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)