blob: da915778e7e63e61cecae6726265f8468a0219a7 [file] [log] [blame]
Gian Marco05288a22017-11-21 10:57:50 +00001/*
Gian Marco7b4d5472018-01-10 15:56:30 +00002 * Copyright (c) 2017-2018 ARM Limited.
Gian Marco05288a22017-11-21 10:57:50 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +000025#include "helpers_asymm.h"
Gian Marco05288a22017-11-21 10:57:50 +000026
Gian Marco19835e52018-01-30 13:35:54 +000027#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +000028/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco19835e52018-01-30 13:35:54 +000029 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
Gian Marco05288a22017-11-21 10:57:50 +000030 *
Gian Marco19835e52018-01-30 13:35:54 +000031 * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
32 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
33 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
Gian Marco05288a22017-11-21 10:57:50 +000034 *
35 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
36 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
37 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
38 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
39 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
40 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
41 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
42 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
43 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
44 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
45 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
46 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
47 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
48 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
49 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
50 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
51 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
52 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
53 */
Gian Marco19835e52018-01-30 13:35:54 +000054__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
55 IMAGE_DECLARATION(src1),
56 IMAGE_DECLARATION(dst))
Gian Marco05288a22017-11-21 10:57:50 +000057{
Gian Marco19835e52018-01-30 13:35:54 +000058 int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
59 int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marco05288a22017-11-21 10:57:50 +000060
Gian Marco19835e52018-01-30 13:35:54 +000061 // Offset
62 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
63 const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
64
65 // src_addr_a = address of matrix A
66 // src_addr_b = address of matrix B
67 __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
68 __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
Gian Marco05288a22017-11-21 10:57:50 +000069
70 // Compute end row address for matrix B
Gian Marco19835e52018-01-30 13:35:54 +000071 __global uchar *src_end_addr_b = src_addr_b + COLS_B;
72
73 src_addr_a += offset_row_a;
74 src_addr_b += offset_row_b;
Gian Marco05288a22017-11-21 10:57:50 +000075
76 // Reset accumulators
Gian Marco19835e52018-01-30 13:35:54 +000077 int4 c00 = 0;
78 int4 c10 = 0;
79 int4 c20 = 0;
80 int4 c30 = 0;
Gian Marco05288a22017-11-21 10:57:50 +000081
Gian Marco19835e52018-01-30 13:35:54 +000082 for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +000083 {
84 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +000085 int4 a0 = convert_int4(vload4(0, src_addr_a));
86 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +000087
Gian Marco19835e52018-01-30 13:35:54 +000088 c00 += (int4)a0.s0 * b0;
89 c10 += (int4)a0.s1 * b0;
90 c20 += (int4)a0.s2 * b0;
91 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +000092
Gian Marco19835e52018-01-30 13:35:54 +000093 a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
94 b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));
Gian Marco05288a22017-11-21 10:57:50 +000095
Gian Marco19835e52018-01-30 13:35:54 +000096 c00 += (int4)a0.s0 * b0;
97 c10 += (int4)a0.s1 * b0;
98 c20 += (int4)a0.s2 * b0;
99 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000100 }
101
Gian Marco19835e52018-01-30 13:35:54 +0000102 for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
Gian Marco05288a22017-11-21 10:57:50 +0000103 {
104 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +0000105 int4 a0 = convert_int4(vload4(0, src_addr_a));
106 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +0000107
Gian Marco19835e52018-01-30 13:35:54 +0000108 c00 += (int4)a0.s0 * b0;
109 c10 += (int4)a0.s1 * b0;
110 c20 += (int4)a0.s2 * b0;
111 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000112 }
113
114 // Compute destination address
115 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
116
Gian Marco19835e52018-01-30 13:35:54 +0000117 // Store 4x4 block
118 vstore4(c00, 0, (__global int *)(offset(&dst, 0, 0)));
119 vstore4(c10, 0, (__global int *)(offset(&dst, 0, 1)));
120 vstore4(c20, 0, (__global int *)(offset(&dst, 0, 2)));
121 vstore4(c30, 0, (__global int *)(offset(&dst, 0, 3)));
Gian Marco05288a22017-11-21 10:57:50 +0000122}
Gian Marco19835e52018-01-30 13:35:54 +0000123
124/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
125 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
126 *
127 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
128 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
129 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
130 *
131 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
132 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
133 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
134 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
135 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
136 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
137 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
138 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
139 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
140 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
141 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
142 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
143 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
144 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
145 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
146 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
147 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
148 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
149 */
__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    // Index of the transposed block of B (x) and interleaved block of A (y)
    // assigned to this work-item.
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset (in bytes) of this work-item's rows inside the reshaped blocks
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B (COLS_B bytes past the row base,
    // computed before applying offset_row_b)
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators: one scalar uint per element of the 4x4 output block
    // (cRC = output row R, column C).
    uint c00 = 0;
    uint c01 = 0;
    uint c02 = 0;
    uint c03 = 0;
    uint c10 = 0;
    uint c11 = 0;
    uint c12 = 0;
    uint c13 = 0;
    uint c20 = 0;
    uint c21 = 0;
    uint c22 = 0;
    uint c23 = 0;
    uint c30 = 0;
    uint c31 = 0;
    uint c32 = 0;
    uint c33 = 0;

// The 32-wide unrolled main loop below hard-codes A strides of 16 bytes per
// vload16, which only matches the interleaved layout when the interleave
// height multiplier is 1; other multipliers fall through to the generic
// leftover loop.
#if MULT_INTERLEAVE4X4_HEIGHT == 1
    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
    {
#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
        // Fast path using arm_dot.
        // NOTE(review): arm_dot is assumed to be the ARM 8-bit dot-product
        // builtin (uchar4 . uchar4 -> accumulated u32) provided when
        // ARM_COMPUTE_OPENCL_DOT8_ENABLED is set — confirm against helpers.h.
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);
        uchar4 b1 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
        uchar4 b2 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
        uchar4 b3 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        // Accumulate: each arm_dot folds 4 K-steps; a0.sN gathers row N of A
        // across the 4 interleaved groups, bX.sM gathers column M of B.
        c00 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c01 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c02 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c03 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c10 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c11 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c12 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c13 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c20 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c21 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c22 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c23 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c30 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c31 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c32 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c33 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        // Load values from matrix A (interleaved) and matrix B (transposed)
        // -- second half of the 32-step unroll
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
        b1 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
        b2 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
        b3 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        // Accumulate
        c00 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c01 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c02 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c03 += arm_dot((uchar4)(a0.s0, a0.s4, a0.s8, a0.sC), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c10 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c11 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c12 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c13 += arm_dot((uchar4)(a0.s1, a0.s5, a0.s9, a0.sD), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c20 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c21 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c22 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c23 += arm_dot((uchar4)(a0.s2, a0.s6, a0.sA, a0.sE), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));

        c30 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0));
        c31 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1));
        c32 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2));
        c33 += arm_dot((uchar4)(a0.s3, a0.s7, a0.sB, a0.sF), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3));
#else  // ARM_COMPUTE_OPENCL_DOT8_ENABLED
        // Scalar fallback: widen each u8*u8 product through ushort before
        // accumulating into the uint accumulators (avoids uchar overflow).
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;

        // Load values from matrix A (interleaved) and matrix B (transposed)
        // -- second half of the 32-step unroll
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;
#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
    }
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

    // Leftover loop: one 4-value accumulation step per iteration, until the
    // end of B's row is reached.
    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar4 a0 = vload4(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block of S32 results (one row per vstore4)
    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
}
475#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000476
477#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
478#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
479#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
480#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
 481/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
482 *
483 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
484 *
485 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
486 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
487 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
488 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
489 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
490 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
491 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
492 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
493 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
494 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
495 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
496 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
497 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
498 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
499 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
500 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
501 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
502 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
503 */
Gian Marco7b4d5472018-01-10 15:56:30 +0000504__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
505 IMAGE_DECLARATION(src1),
506 IMAGE_DECLARATION(dst))
Gian Marco05288a22017-11-21 10:57:50 +0000507{
508 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
509
510 // Compute starting address for matrix A and Matrix B
511 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
512
513 // Update address for the matrix A
514 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
515
516 // Update address for the matrix B
517 src_addr.s1 += idx;
518
519 int end_row_vec_a = src_addr.s0 + COLS_A;
520
521 VECTOR_UINT acc0 = 0;
522#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
523 VECTOR_UINT acc1 = 0;
524#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
525#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
526 VECTOR_UINT acc2 = 0;
527#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
528#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
529 VECTOR_UINT acc3 = 0;
530#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000531#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
532 VECTOR_UINT acc4 = 0;
533#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000534
535 for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
536 {
537 // Load values from matrix A
538 uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
539#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
540 uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
541#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
542#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
543 uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
544#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
545#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
546 uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
547#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000548#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
549 uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
550#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000551 // Load values from matrix B
552 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
553 VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
554
555 // Accumulate
556 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
557 acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
558#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
559 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
560 acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
561#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
562#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
563 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
564 acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
565#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
566#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
567 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
568 acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
569#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000570#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
571 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
572 acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
573#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000574 }
575
576 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
577 {
578 // Load values from matrix A
579 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
580#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
581 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
582#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
583#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
584 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
585#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
586#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
587 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
588#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000589#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
590 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
591#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000592 // Load values from matrix B
593 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
594
595 // Accumulate
596 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
597#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
598 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
599#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
600#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
601 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
602#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
603#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
604 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
605#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000606#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
607 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
608#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000609 }
610
611 // Compute destination address
612 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
613
614 // Store the result
615 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
616 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
617#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
618 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
619 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
620#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
621#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
622 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
623 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
624#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
625#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
626 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
627 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
628#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000629#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
630 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
631 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
632#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
633}
634
635/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
636 *
637 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
638 *
639 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
640 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
641 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
642 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
643 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
644 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
645 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
646 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
647 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
648 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
649 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
650 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
651 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
652 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
653 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
654 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
655 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
656 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
657 */
658__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
659 IMAGE_DECLARATION(src1),
660 IMAGE_DECLARATION(dst))
661{
662 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
663
664 // Compute starting address for matrix A and Matrix B
665 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
666
667 // Update address for the matrix A
668 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
669
670 // Update address for the matrix B
671 src_addr.s1 += idx;
672
673 int end_row_vec_a = src_addr.s0 + COLS_A;
674
675 uint acc00 = 0;
676 uint acc01 = 0;
677 uint acc02 = 0;
678 uint acc03 = 0;
679#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
680 uint acc10 = 0;
681 uint acc11 = 0;
682 uint acc12 = 0;
683 uint acc13 = 0;
684#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
685#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
686 uint acc20 = 0;
687 uint acc21 = 0;
688 uint acc22 = 0;
689 uint acc23 = 0;
690#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
691#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
692 uint acc30 = 0;
693 uint acc31 = 0;
694 uint acc32 = 0;
695 uint acc33 = 0;
696#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
697#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
698 uint acc40 = 0;
699 uint acc41 = 0;
700 uint acc42 = 0;
701 uint acc43 = 0;
702#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
703
704 for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
705 {
706 // Load values from matrix A
707 uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
708#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
709 uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
710#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
711#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
712 uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
713#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
714#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
715 uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
716#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
717#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
718 uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
719#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
720 // Load values from matrix B
721 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
722 uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
723 uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
724 uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
725
726 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000727#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
728 // Accumulate
729 acc00 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a0);
730 acc01 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a0);
731 acc02 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a0);
732 acc03 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a0);
733#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000734 // Accumulate
735 ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
736 ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
737 ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
738 ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;
739
740 ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
741 ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
742 ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
743 ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;
744
745 ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
746 ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
747 ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
748 ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;
749
750 ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
751 ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
752 ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
753 ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;
754
755 acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
756 acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
757 acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
758 acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000759#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000760 }
761#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
762 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000763#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
764 // Accumulate
765 acc10 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a1);
766 acc11 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a1);
767 acc12 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a1);
768 acc13 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a1);
769#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000770 // Accumulate
771 ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
772 ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
773 ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
774 ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;
775
776 ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
777 ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
778 ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
779 ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;
780
781 ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
782 ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
783 ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
784 ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;
785
786 ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
787 ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
788 ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
789 ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;
790
791 acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
792 acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
793 acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
794 acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000795#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000796 }
797#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
798#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
799 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000800#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
801 // Accumulate
802 acc20 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a2);
803 acc21 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a2);
804 acc22 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a2);
805 acc23 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a2);
806#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000807 // Accumulate
808 ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
809 ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
810 ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
811 ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;
812
813 ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
814 ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
815 ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
816 ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;
817
818 ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
819 ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
820 ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
821 ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;
822
823 ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
824 ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
825 ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
826 ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;
827
828 acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
829 acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
830 acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
831 acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000832#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000833 }
834#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
835#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
836 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000837#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
838 // Accumulate
839 acc30 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a3);
840 acc31 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a3);
841 acc32 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a3);
842 acc33 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a3);
843#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000844 // Accumulate
845 ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
846 ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
847 ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
848 ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;
849
850 ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
851 ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
852 ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
853 ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;
854
855 ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
856 ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
857 ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
858 ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;
859
860 ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
861 ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
862 ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
863 ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;
864
865 acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
866 acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
867 acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
868 acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000869#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000870 }
871#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
872#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
873 {
Michalis Spyroue03342e2018-01-15 14:39:13 +0000874#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
875 // Accumulate
876 acc40 += arm_dot((uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), a4);
877 acc41 += arm_dot((uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), a4);
878 acc42 += arm_dot((uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), a4);
879 acc43 += arm_dot((uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), a4);
880#else // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000881 // Accumulate
882 ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
883 ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
884 ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
885 ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;
886
887 ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
888 ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
889 ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
890 ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;
891
892 ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
893 ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
894 ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
895 ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;
896
897 ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
898 ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
899 ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
900 ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;
901
902 acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
903 acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
904 acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
905 acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
Michalis Spyroue03342e2018-01-15 14:39:13 +0000906#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Gian Marco7b4d5472018-01-10 15:56:30 +0000907 }
908#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
909 }
910
911 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
912 {
913 // Load values from matrix A
914 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
915#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
916 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
917#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
918#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
919 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
920#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
921#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
922 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
923#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
924#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
925 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
926#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
927 // Load values from matrix B
928 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);
929
930 // Accumulate
931 {
932 // Accumulate
933 ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
934 ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
935 ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
936 ushort tmp3 = (ushort)b0.s3 * (ushort)a0;
937
938 acc00 += ((uint)tmp0);
939 acc01 += ((uint)tmp1);
940 acc02 += ((uint)tmp2);
941 acc03 += ((uint)tmp3);
942 }
943#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
944 {
945 // Accumulate
946 ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
947 ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
948 ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
949 ushort tmp3 = (ushort)b0.s3 * (ushort)a1;
950
951 acc10 += ((uint)tmp0);
952 acc11 += ((uint)tmp1);
953 acc12 += ((uint)tmp2);
954 acc13 += ((uint)tmp3);
955 }
956#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
957#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
958 {
959 // Accumulate
960 ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
961 ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
962 ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
963 ushort tmp3 = (ushort)b0.s3 * (ushort)a2;
964
965 acc20 += ((uint)tmp0);
966 acc21 += ((uint)tmp1);
967 acc22 += ((uint)tmp2);
968 acc23 += ((uint)tmp3);
969 }
970#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
971#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
972 {
973 // Accumulate
974 ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
975 ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
976 ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
977 ushort tmp3 = (ushort)b0.s3 * (ushort)a3;
978
979 acc30 += ((uint)tmp0);
980 acc31 += ((uint)tmp1);
981 acc32 += ((uint)tmp2);
982 acc33 += ((uint)tmp3);
983 }
984#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
985#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
986 {
987 // Accumulate
988 ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
989 ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
990 ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
991 ushort tmp3 = (ushort)b0.s3 * (ushort)a4;
992
993 acc40 += ((uint)tmp0);
994 acc41 += ((uint)tmp1);
995 acc42 += ((uint)tmp2);
996 acc43 += ((uint)tmp3);
997 }
998#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
999 }
1000
1001 // Compute destination address
1002 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1003
1004 // Store the result
1005 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
1006#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1007 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
1008#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1009#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1010 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
1011#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1012#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1013 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
1014#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1015#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1016 vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
1017#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +00001018}
1019#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
1020
#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
 *
 * @note This stage is needed to handle the offset of matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Each workitem reduces one full row of matrix A (row selected by gid0, batch by gid1)
    __global const uchar *row_ptr = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    uint4 vec_acc    = (uint4)0; // vectorized partial sums
    uint  scalar_acc = 0;        // leftover element sums

    int col = 0;

    // Main loop: 16 accumulations per iteration
    for(; col <= ((int)COLS_A - 16); col += 16)
    {
        const uchar16 vals = vload16(0, row_ptr + col);

        vec_acc += convert_uint4(vals.s0123) + convert_uint4(vals.s4567) + convert_uint4(vals.s89AB) + convert_uint4(vals.sCDEF);
    }

    // Leftover loop: remaining columns one at a time
    for(; col < COLS_A; ++col)
    {
        scalar_acc += row_ptr[col];
    }

    // Horizontal reduction of the vector accumulator into the scalar total
    scalar_acc += vec_acc.s0 + vec_acc.s1 + vec_acc.s2 + vec_acc.s3;

    *((__global int *)dst.ptr) = (int)scalar_acc;
}
#endif // defined(COLS_A)
1077
#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
 *
 * @note This stage is needed to handle the offset of matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // 16 per-column partial sums, one per element handled by this workitem
    uint16 col_sums = (uint16)0;

    // Each workitem walks all rows of one batch of matrix B (batch selected by gid1)
    __global const uchar *row_ptr = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

    int row = 0;

    // Main loop: accumulate 4 rows of 16 columns per iteration
    for(; row <= ((int)ROWS_B - 4); row += 4)
    {
        const uchar16 r0 = vload16(0, row_ptr + 0 * src_stride_y);
        const uchar16 r1 = vload16(0, row_ptr + 1 * src_stride_y);
        const uchar16 r2 = vload16(0, row_ptr + 2 * src_stride_y);
        const uchar16 r3 = vload16(0, row_ptr + 3 * src_stride_y);

        col_sums += convert_uint16(r0) + convert_uint16(r1) + convert_uint16(r2) + convert_uint16(r3);

        row_ptr += 4 * src_stride_y;
    }

    // Leftover loop: remaining rows one at a time
    for(; row < (int)ROWS_B; ++row)
    {
        col_sums += convert_uint16(vload16(0, row_ptr));

        row_ptr += src_stride_y;
    }

    vstore16(convert_int16(col_sums), 0, (__global int *)dst.ptr);
}
#endif // defined(COLS_B) && defined(ROWS_B)
1139
#if defined(K_OFFSET)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
 *
 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * @param[in] mm_result_ptr                                Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x                           Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x                             mm_result_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] mm_result_stride_y                           Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y                             mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] mm_result_stride_z                           Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z                             mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes      The offset of the first element in the source tensor
 * @param[in] sum_col_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_result_step_x                        sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_col_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_result_step_y                        sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_row_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_result_step_x                        sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_row_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_result_step_y                        sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);

    int4 a_offset_s32 = (int4)0;
    int4 b_offset_s32 = (int4)0;

#if defined(A_OFFSET)
    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);

    // Compute the offset contribution due to A_OFFSET: one sum_col value per output column
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
#else  // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr));
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (int4)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // Compute the offset contribution due to B_OFFSET: one sum_row value broadcast over the row
    b_offset_s32 = (int4) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
    b_offset_s32 *= (int4)B_OFFSET;
#endif // defined(B_OFFSET)

    const int4 offset_term_s32 = (int4)K_OFFSET + a_offset_s32 + b_offset_s32;

    int4 in_s32 = vload4(0, (__global int *)mm_result.ptr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // Store the result with the offset contribution (in-place)
    vstore4(in_s32, 0, (__global int *)mm_result.ptr);
}
#endif // defined(K_OFFSET)
1227
1228#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
1229/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
1230 *
1231 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
1232 * The following computations will be performed by the kernel:
1233 *
1234 * -# Add offset terms to final result
1235 * -# Multiply each entry of result by result_mult_int
1236 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
1237 * -# Shift the int32 accumulator by result_shift
1238 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
1239 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1240 *
1241 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
1242 *
1243 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1244 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1245 * These values can be used to implement "rectified linear unit" activation functions
1246 *
1247 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1248 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1249 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1250 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1251 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1252 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1253 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1254 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1255 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
1256 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
1257 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
1258 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
1266 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1267 */
1268__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
1269#if defined(ADD_BIAS)
1270 VECTOR_DECLARATION(biases),
1271#endif // defined(ADD_BIAS)
1272 TENSOR3D_DECLARATION(dst))
1273{
1274 // Compute source and destination addresses
1275 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1276 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
1277#if defined(ADD_BIAS)
1278 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1279#endif // defined(ADD_BIAS)
1280
1281 int16 input_values = vload16(0, (__global int *)src.ptr);
1282
Gian Marco58c57942017-11-28 09:10:03 +00001283 // Add the offset terms to GEMM's result
1284 input_values += (int16)RESULT_OFFSET;
1285
Gian Marco05288a22017-11-21 10:57:50 +00001286#if defined(ADD_BIAS)
1287 // Add bias
1288 const int16 biases_values = vload16(0, (__global int *)biases.ptr);
1289 input_values += (int16)biases_values;
1290#endif // defined(ADD_BIAS)
1291
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +00001292 // Multiply by result_mult_int and shift
Gian Marco58c57942017-11-28 09:10:03 +00001293 input_values *= RESULT_MULT_INT;
Gian Marco05288a22017-11-21 10:57:50 +00001294
Gian Marco58c57942017-11-28 09:10:03 +00001295 input_values >>= RESULT_SHIFT;
Gian Marco05288a22017-11-21 10:57:50 +00001296
1297 uchar16 res = convert_uchar16_sat(input_values);
1298
1299#if defined(MIN_BOUND)
1300 res = max(res, (uchar16)MIN_BOUND);
1301#endif // defined(MIN_BOUND)
1302#if defined(MAX_BOUND)
1303 res = min(res, (uchar16)MAX_BOUND);
1304#endif // defined(MAX_BOUND)
1305
1306 // Store the result
1307 vstore16(res, 0, dst.ptr);
1308}
Gian Marco58c57942017-11-28 09:10:03 +00001309#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
1310
1311#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
1312/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
1313 *
1314 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
1315 * The following computations will be performed by the kernel:
1316 *
1317 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
1318 * -# Add bias to final result if bias tensor is not a nullptr
1319 * -# Round to nearest division by a power-of-two using result_shift
1320 * -# Add offset to each result
1321 * -# Clamp the value between the specified min and max bounds
1322 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1323 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
1325 *
1326 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1327 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1328 * These values can be used to implement "rectified linear unit" activation functions
1329 *
1330 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1331 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1332 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1333 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1334 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1335 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1336 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1337 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1338 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
1339 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
1340 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
1341 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
1349 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1350 */
1351__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
1352#if defined(ADD_BIAS)
1353 VECTOR_DECLARATION(biases),
1354#endif // defined(ADD_BIAS)
1355 TENSOR3D_DECLARATION(dst))
1356{
1357 // Compute source and destination addresses
1358 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1359 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
1360#if defined(ADD_BIAS)
1361 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1362#endif // defined(ADD_BIAS)
1363
1364 int16 input_values = vload16(0, (__global int *)src.ptr);
1365
1366#if defined(ADD_BIAS)
1367 // Add bias
1368 const int16 biases_values = vload16(0, (__global int *)biases.ptr);
1369 input_values += (int16)biases_values;
1370#endif // defined(ADD_BIAS)
1371
1372 // Multiply by result_mult_int and shift
1373 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);
1374
1375 // Add the offset terms to GEMM's result
1376 input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;
1377
1378 uchar16 res = convert_uchar16_sat(input_values);
1379
1380#if defined(MIN_BOUND)
1381 res = max(res, (uchar16)MIN_BOUND);
1382#endif // defined(MIN_BOUND)
1383#if defined(MAX_BOUND)
1384 res = min(res, (uchar16)MAX_BOUND);
1385#endif // defined(MAX_BOUND)
1386
1387 // Store the result
1388 vstore16(res, 0, dst.ptr);
1389}
Chunosov5124be52017-11-22 20:42:13 +07001390#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)