Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

blob: bad09f3c427930c17a4502e0ac8c6d9c63d54133 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	2	* Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "helpers.h"
				25
Gian Marco Iodice	368da83	2017-07-03 12:33:49 +0100	[diff] [blame]	26	#ifdef FIXED_POINT_POSITION
				27	#include "fixed_point.h"
				28	#endif // FIXED_POINT_POSITION
				29
#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)

// Select an unsigned integer type so that one vector of TRANSPOSE_W elements is
// always 16 bytes wide (4 x 4 bytes, 8 x 2 bytes or 16 x 1 byte). The kernel only
// moves data, so the element type is used purely as a container of the right size.
#if TRANSPOSE_W == 4
#define DATA_TYPE uint
#elif TRANSPOSE_W == 8
#define DATA_TYPE ushort
#elif TRANSPOSE_W == 16
#define DATA_TYPE uchar
#else // Unsupported TRANSPOSE_W
#error "Transpose width not supported"
#endif // TRANSPOSE_W

/** This OpenCL kernel computes the "vector" 1xW transposition of the input matrix
 *
 * Each work-item copies one vector of TRANSPOSE_W elements from the source to the destination.
 * In the destination the roles of the two global ids are exchanged: the id along Y advances the byte offset
 * inside a destination row, while the id along X (divided by MULT_TRANSPOSE1XW_WIDTH) selects the destination row.
 *
 * @attention The transpose width must be passed at compile time using -DTRANSPOSE_W (supported values: 4, 8 and 16)
 * @attention The multiplication factor (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
 *
 * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src),
                                IMAGE_DECLARATION(dst))
{
    const uint col = get_global_id(0);
    const uint row = get_global_id(1);

    // Source address of the vector handled by this work-item
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // Read one vector of TRANSPOSE_W elements from the source
    VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
    vec = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);

    // Destination byte offset, built term by term (X and Y are swapped with respect to the source)
    uint dst_offset = dst_offset_first_element_in_bytes;
    // The source row index moves along the destination row, MULT_TRANSPOSE1XW_WIDTH vectors at a time
    dst_offset += row * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH;
    // Groups of MULT_TRANSPOSE1XW_WIDTH source vectors along X share one destination row
    dst_offset += (col / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y;
    // Slot of this vector inside its group
    dst_offset += (col % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);

    VSTORE(TRANSPOSE_W)
    (vec, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset));
}
#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	79
#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)

/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
 *
 * Each work-item loads four vectors of four elements (one per row, rows 0..3 relative to its source position)
 * and writes the four columns of that 4x4 block as vectors, spaced 4 * MULT_INTERLEAVE4X4_HEIGHT elements apart.
 *
 * @attention The data type and the multiplication factor must be passed at compile time using -DDATA_TYPE and -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src),
                                 IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    uint x = get_global_id(0);
    uint y = get_global_id(1);

    // Compute address for the source matrix
    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // Compute address for the destination matrix:
    //  - each x step skips 16 elements (one 4x4 block) for every one of the MULT_INTERLEAVE4X4_HEIGHT interleaved blocks
    //  - groups of MULT_INTERLEAVE4X4_HEIGHT consecutive y values share one destination row
    //  - inside a group, each y value is shifted by one vector of 4 elements
    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
                             (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);

    // Load the four rows of the 4x4 block from the source matrix
    VEC_DATA_TYPE(DATA_TYPE, 4)
    a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
    VEC_DATA_TYPE(DATA_TYPE, 4)
    a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));

    // Store column 0 of the block
    VEC_DATA_TYPE(DATA_TYPE, 4)
    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));

    // Store column 1 of the block
    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1);
    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));

    // Store column 2 of the block
    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2);
    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));

    // Store column 3 of the block
    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
}
#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	135
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	136	#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW (4 elements per vector) before running the matrix multiplication
 *
 * Each work-item accumulates one 4x4 block of the result in four float4 registers and stores it as four rows of the destination.
 *
 * @attention The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
 * @attention The multiplication factors used by the reshape kernels must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH and -DMULT_INTERLEAVE4X4_HEIGHT
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    // Row of the reshaped matrices that holds the data for this work-item
    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset (in elements) of this work-item's vector inside a group of interleaved/transposed vectors
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B (taken before the per-work-item offset is applied)
    __global float *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    float4 c00 = 0.0f;
    float4 c10 = 0.0f;
    float4 c20 = 0.0f;
    float4 c30 = 0.0f;

    // Main loop: two accumulation steps per iteration
    for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        float4 a0 = vload4(0, src_addr_a);
        float4 b0 = vload4(0, src_addr_b);

        c00 += (float4)a0.s0 * b0;
        c10 += (float4)a0.s1 * b0;
        c20 += (float4)a0.s2 * b0;
        c30 += (float4)a0.s3 * b0;

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
        b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

        c00 += (float4)a0.s0 * b0;
        c10 += (float4)a0.s1 * b0;
        c20 += (float4)a0.s2 * b0;
        c30 += (float4)a0.s3 * b0;
    }

    // Left-over loop: one accumulation step per iteration
    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        float4 a0 = vload4(0, src_addr_a);
        float4 b0 = vload4(0, src_addr_b);

        c00 += (float4)a0.s0 * b0;
        c10 += (float4)a0.s1 * b0;
        c20 += (float4)a0.s2 * b0;
        c30 += (float4)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

#if defined(ALPHA)
    // Multiply by the weight of matrix product
    c00 = c00 * (float4)ALPHA;
    c10 = c10 * (float4)ALPHA;
    c20 = c20 * (float4)ALPHA;
    c30 = c30 * (float4)ALPHA;
#endif // defined(ALPHA)

    // Store 4x4 block
    vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
    vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
    vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
    vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
}
				239
/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4 and @ref gemm_transpose1xW (4 elements per vector) before running the matrix multiplication
 *
 * Each work-item accumulates one 4x4 block of the result in sixteen scalar registers using fma() and stores it as four rows of the destination.
 *
 * @attention The number of matrix B columns and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
 * @attention The multiplication factors used by the reshape kernels must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH and -DMULT_INTERLEAVE4X4_HEIGHT
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    // Row of the reshaped matrices that holds the data for this work-item
    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset (in elements) of this work-item's vector inside a group of interleaved/transposed vectors
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B (taken before the per-work-item offset is applied)
    __global float *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    float c00 = 0.0f;
    float c01 = 0.0f;
    float c02 = 0.0f;
    float c03 = 0.0f;
    float c10 = 0.0f;
    float c11 = 0.0f;
    float c12 = 0.0f;
    float c13 = 0.0f;
    float c20 = 0.0f;
    float c21 = 0.0f;
    float c22 = 0.0f;
    float c23 = 0.0f;
    float c30 = 0.0f;
    float c31 = 0.0f;
    float c32 = 0.0f;
    float c33 = 0.0f;

    // Main loop: four accumulation steps per iteration
    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        float4 a0 = vload4(0, src_addr_a);
        float4 b0 = vload4(0, src_addr_b);

        c00 = fma(a0.s0, b0.s0, c00);
        c01 = fma(a0.s0, b0.s1, c01);
        c02 = fma(a0.s0, b0.s2, c02);
        c03 = fma(a0.s0, b0.s3, c03);

        c10 = fma(a0.s1, b0.s0, c10);
        c11 = fma(a0.s1, b0.s1, c11);
        c12 = fma(a0.s1, b0.s2, c12);
        c13 = fma(a0.s1, b0.s3, c13);

        c20 = fma(a0.s2, b0.s0, c20);
        c21 = fma(a0.s2, b0.s1, c21);
        c22 = fma(a0.s2, b0.s2, c22);
        c23 = fma(a0.s2, b0.s3, c23);

        c30 = fma(a0.s3, b0.s0, c30);
        c31 = fma(a0.s3, b0.s1, c31);
        c32 = fma(a0.s3, b0.s2, c32);
        c33 = fma(a0.s3, b0.s3, c33);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
        b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

        c00 = fma(a0.s0, b0.s0, c00);
        c01 = fma(a0.s0, b0.s1, c01);
        c02 = fma(a0.s0, b0.s2, c02);
        c03 = fma(a0.s0, b0.s3, c03);

        c10 = fma(a0.s1, b0.s0, c10);
        c11 = fma(a0.s1, b0.s1, c11);
        c12 = fma(a0.s1, b0.s2, c12);
        c13 = fma(a0.s1, b0.s3, c13);

        c20 = fma(a0.s2, b0.s0, c20);
        c21 = fma(a0.s2, b0.s1, c21);
        c22 = fma(a0.s2, b0.s2, c22);
        c23 = fma(a0.s2, b0.s3, c23);

        c30 = fma(a0.s3, b0.s0, c30);
        c31 = fma(a0.s3, b0.s1, c31);
        c32 = fma(a0.s3, b0.s2, c32);
        c33 = fma(a0.s3, b0.s3, c33);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT);
        b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

        c00 = fma(a0.s0, b0.s0, c00);
        c01 = fma(a0.s0, b0.s1, c01);
        c02 = fma(a0.s0, b0.s2, c02);
        c03 = fma(a0.s0, b0.s3, c03);

        c10 = fma(a0.s1, b0.s0, c10);
        c11 = fma(a0.s1, b0.s1, c11);
        c12 = fma(a0.s1, b0.s2, c12);
        c13 = fma(a0.s1, b0.s3, c13);

        c20 = fma(a0.s2, b0.s0, c20);
        c21 = fma(a0.s2, b0.s1, c21);
        c22 = fma(a0.s2, b0.s2, c22);
        c23 = fma(a0.s2, b0.s3, c23);

        c30 = fma(a0.s3, b0.s0, c30);
        c31 = fma(a0.s3, b0.s1, c31);
        c32 = fma(a0.s3, b0.s2, c32);
        c33 = fma(a0.s3, b0.s3, c33);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT);
        b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH);

        c00 = fma(a0.s0, b0.s0, c00);
        c01 = fma(a0.s0, b0.s1, c01);
        c02 = fma(a0.s0, b0.s2, c02);
        c03 = fma(a0.s0, b0.s3, c03);

        c10 = fma(a0.s1, b0.s0, c10);
        c11 = fma(a0.s1, b0.s1, c11);
        c12 = fma(a0.s1, b0.s2, c12);
        c13 = fma(a0.s1, b0.s3, c13);

        c20 = fma(a0.s2, b0.s0, c20);
        c21 = fma(a0.s2, b0.s1, c21);
        c22 = fma(a0.s2, b0.s2, c22);
        c23 = fma(a0.s2, b0.s3, c23);

        c30 = fma(a0.s3, b0.s0, c30);
        c31 = fma(a0.s3, b0.s1, c31);
        c32 = fma(a0.s3, b0.s2, c32);
        c33 = fma(a0.s3, b0.s3, c33);
    }

    // Left-over loop: one accumulation step per iteration
    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        float4 a0 = vload4(0, src_addr_a);
        float4 b0 = vload4(0, src_addr_b);

        c00 = fma(a0.s0, b0.s0, c00);
        c01 = fma(a0.s0, b0.s1, c01);
        c02 = fma(a0.s0, b0.s2, c02);
        c03 = fma(a0.s0, b0.s3, c03);

        c10 = fma(a0.s1, b0.s0, c10);
        c11 = fma(a0.s1, b0.s1, c11);
        c12 = fma(a0.s1, b0.s2, c12);
        c13 = fma(a0.s1, b0.s3, c13);

        c20 = fma(a0.s2, b0.s0, c20);
        c21 = fma(a0.s2, b0.s1, c21);
        c22 = fma(a0.s2, b0.s2, c22);
        c23 = fma(a0.s2, b0.s3, c23);

        c30 = fma(a0.s3, b0.s0, c30);
        c31 = fma(a0.s3, b0.s1, c31);
        c32 = fma(a0.s3, b0.s2, c32);
        c33 = fma(a0.s3, b0.s3, c33);
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

#if defined(ALPHA)
    // Multiply by the weight of matrix product
    c00 = c00 * ALPHA;
    c01 = c01 * ALPHA;
    c02 = c02 * ALPHA;
    c03 = c03 * ALPHA;
    c10 = c10 * ALPHA;
    c11 = c11 * ALPHA;
    c12 = c12 * ALPHA;
    c13 = c13 * ALPHA;
    c20 = c20 * ALPHA;
    c21 = c21 * ALPHA;
    c22 = c22 * ALPHA;
    c23 = c23 * ALPHA;
    c30 = c30 * ALPHA;
    c31 = c31 * ALPHA;
    c32 = c32 * ALPHA;
    c33 = c33 * ALPHA;
#endif // defined(ALPHA)

    // Store 4x4 block
    vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
    vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
    vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
    vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
}
				459
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
 *
 * @attention The number of matrix B columns and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
 * @attention The kernel body also uses MULT_TRANSPOSE1XW_WIDTH and MULT_INTERLEAVE4X4_HEIGHT, which therefore have to be defined at compile time as well
 *
 * @note Each work-item accumulates four half8 vectors and stores them as a 4x8 block of the destination
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst))
{
    // Row index inside the reshaped matrices: several work-items share one reshaped row
    const int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset (in elements) of this work-item's slice inside the shared reshaped row
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    // The byte offset is computed on the raw pointer first and only then reinterpreted as half
    __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B (taken before offset_row_b is applied)
    __global half *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    half8 c00 = 0.0f;
    half8 c10 = 0.0f;
    half8 c20 = 0.0f;
    half8 c30 = 0.0f;

    // Main loop: two accumulation steps per iteration
    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        half4 a0 = vload4(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        c00 += (half8)a0.s0 * b0;
        c10 += (half8)a0.s1 * b0;
        c20 += (half8)a0.s2 * b0;
        c30 += (half8)a0.s3 * b0;

        // Load the next values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
        b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

        c00 += (half8)a0.s0 * b0;
        c10 += (half8)a0.s1 * b0;
        c20 += (half8)a0.s2 * b0;
        c30 += (half8)a0.s3 * b0;
    }

    // Left-over loop: one accumulation step per iteration
    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        half4 a0 = vload4(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        c00 += (half8)a0.s0 * b0;
        c10 += (half8)a0.s1 * b0;
        c20 += (half8)a0.s2 * b0;
        c30 += (half8)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

#if defined(ALPHA)
    // Multiply by the weight of matrix product
    c00 = c00 * (half8)ALPHA;
    c10 = c10 * (half8)ALPHA;
    c20 = c20 * (half8)ALPHA;
    c30 = c30 * (half8)ALPHA;
#endif // defined(ALPHA)

    // Store 4x8 block
    vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
    vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
    vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
    vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	564
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	565	#if defined(FIXED_POINT_POSITION)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	566	/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision
				567	* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
				568	*
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	569	* @attention The number of matrix B columns, the optional alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	570	*
				571	* @note: ALPHA must be passed in 8 bit fixed point format
				572	*
				573	* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
				574	* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
				575	* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				576	* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
				577	* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				578	* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
				579	* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
				580	* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
				581	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				582	* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
				583	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				584	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
				585	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
				586	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	587	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	588	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	589	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	590	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				591	*/
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	592	__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
				593	IMAGE_DECLARATION(src1),
				594	IMAGE_DECLARATION(dst))
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	595	{
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	596	int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
				597	int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	598
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	599	// Offset
				600	const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
				601	const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16;
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	602
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	603	// src_addr_a = address of matrix A
				604	// src_addr_b = address of matrix B
				605	__global char src_addr_a = src0_ptr + y src0_stride_y + src0_offset_first_element_in_bytes;
				606	__global char src_addr_b = src1_ptr + x src1_stride_y + src1_offset_first_element_in_bytes;
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	607
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	608	// Compute end row address for matrix B
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	609	__global char *src_end_addr_b = src_addr_b + COLS_B;
				610
				611	src_addr_a += offset_row_a;
				612	src_addr_b += offset_row_b;
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	613
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	614	// Reset accumulators
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	615	short8 c00 = 0.0f;
				616	short8 c10 = 0.0f;
				617	short8 c20 = 0.0f;
				618	short8 c30 = 0.0f;
				619	short8 c01 = 0.0f;
				620	short8 c11 = 0.0f;
				621	short8 c21 = 0.0f;
				622	short8 c31 = 0.0f;
				623
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	624	// This for loop performs 1 accumulation for each iteration
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	625	for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	626	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	627	// Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	628	char4 a0 = vload4(0, src_addr_a);
				629	char16 b0 = vload16(0, src_addr_b);
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	630
				631	c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
				632	c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
				633	c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);
				634	c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);
				635
				636	c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
				637	c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);
				638	c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);
				639	c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);
				640	}
				641
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	642	// Compute destination address
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	643	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				644
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	645	// Multiply by the weight of matrix product
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	646	char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));
				647	char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));
				648	char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));
				649	char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));
				650
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	651	#if defined(ALPHA)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	652	c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
				653	c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
				654	c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
				655	c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	656	#endif // defined(ALPHA)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	657
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	658	// Store 16x4 block
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	659	vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
				660	vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
				661	vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
				662	vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
				663	}
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	664
				665	/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
				666	* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
				667	*
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	668	* @attention The number of matrix B columns, the optional alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	669	*
				670	* @note: ALPHA must be passed in 16 bit fixed point format
				671	*
				672	* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
				673	* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
				674	* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				675	* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
				676	* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				677	* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
				678	* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
				679	* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
				680	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				681	* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
				682	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				683	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
				684	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
				685	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	686	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	687	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	688	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	689	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				690	*/
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	691	__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
				692	IMAGE_DECLARATION(src1),
				693	IMAGE_DECLARATION(dst))
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	694	{
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	695	int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
				696	int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	697
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	698	// Offset
				699	const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
				700	const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	701
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	702	// src_addr_a = address of matrix A
				703	// src_addr_b = address of matrix B
				704	__global short src_addr_a = (__global short )(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
				705	__global short src_addr_b = (__global short )(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	706
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	707	// Compute end row address for matrix B
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	708	__global short *src_end_addr_b = src_addr_b + COLS_B;
				709
				710	src_addr_a += offset_row_a;
				711	src_addr_b += offset_row_b;
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	712
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	713	// Reset accumulators
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	714	int8 c00 = 0.0f;
				715	int8 c10 = 0.0f;
				716	int8 c20 = 0.0f;
				717	int8 c30 = 0.0f;
				718
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	719	// This for loop performs 1 accumulation for each iteration
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	720	for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	721	{
				722	/* Load values from matrix A (interleaved) and matrix B (transposed) */
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	723	short4 a0 = vload4(0, src_addr_a);
				724	short8 b0 = vload8(0, src_addr_b);
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	725
				726	c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
				727	c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
				728	c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);
				729	c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);
				730	}
				731
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	732	// Compute destination address
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	733	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				734
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	735	// Multiply by the weight of matrix product
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	736	short8 c00_qs16 = convert_short8_sat(c00);
				737	short8 c10_qs16 = convert_short8_sat(c10);
				738	short8 c20_qs16 = convert_short8_sat(c20);
				739	short8 c30_qs16 = convert_short8_sat(c30);
				740
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	741	#if defined(ALPHA)
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	742	c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
				743	c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
				744	c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
				745	c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	746	#endif // defined(ALPHA)
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	747
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	748	// Store 8x4 block
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	749	vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
				750	vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
				751	vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
				752	vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
				753	}
				754	#endif // defined(FIXED_POINT_POSITION)
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame^]	755	#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	756
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	757	#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
				758	#if defined(DATA_TYPE)
				759	#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
				760	/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	761	*
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	762	* @note This OpenCL kernel works with floating point data types (F16/F32)
				763	* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
				764	* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	765	* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	766	*
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	767	* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	768	* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
				769	* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				770	* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
				771	* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				772	* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	773	* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	774	* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
				775	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				776	* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
				777	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				778	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	779	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	780	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
				781	* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
				782	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
				783	* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
				784	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				785	*/
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	786	__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
				787	IMAGE_DECLARATION(src1),
				788	IMAGE_DECLARATION(dst))
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	789	{
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	790	int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	791
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	792	// Compute starting address for matrix A and Matrix B
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	793	int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	794
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	795	// Update address for the matrix A
				796	src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	797
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	798	// Update address for the matrix B
				799	src_addr.s1 += idx * sizeof(DATA_TYPE);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	800
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	801	int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
				802
				803	VECTOR_TYPE acc0 = 0.0f;
				804	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				805	VECTOR_TYPE acc1 = 0.0f;
				806	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				807	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				808	VECTOR_TYPE acc2 = 0.0f;
				809	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				810	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				811	VECTOR_TYPE acc3 = 0.0f;
				812	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				813
Georgios Pinitas	96880cf	2017-10-20 18:52:20 +0100	[diff] [blame]	814	for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	815	{
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	816	// Load values from matrix A
				817	VEC_DATA_TYPE(DATA_TYPE, 2)
				818	a0 = vload2(0, (__global DATA_TYPE )(src0_ptr + src_addr.s0 + 0 src0_stride_y));
				819	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				820	VEC_DATA_TYPE(DATA_TYPE, 2)
				821	a1 = vload2(0, (__global DATA_TYPE )(src0_ptr + src_addr.s0 + 1 src0_stride_y));
				822	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				823	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				824	VEC_DATA_TYPE(DATA_TYPE, 2)
				825	a2 = vload2(0, (__global DATA_TYPE )(src0_ptr + src_addr.s0 + 2 src0_stride_y));
				826	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				827	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				828	VEC_DATA_TYPE(DATA_TYPE, 2)
				829	a3 = vload2(0, (__global DATA_TYPE )(src0_ptr + src_addr.s0 + 3 src0_stride_y));
				830	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				831	// Load values from matrix B
				832	VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
				833	VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	834
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	835	// Accumulate
				836	acc0 += b0 * (VECTOR_TYPE)a0.s0;
				837	acc0 += b1 * (VECTOR_TYPE)a0.s1;
				838	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				839	acc1 += b0 * (VECTOR_TYPE)a1.s0;
				840	acc1 += b1 * (VECTOR_TYPE)a1.s1;
				841	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				842	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				843	acc2 += b0 * (VECTOR_TYPE)a2.s0;
				844	acc2 += b1 * (VECTOR_TYPE)a2.s1;
				845	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				846	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				847	acc3 += b0 * (VECTOR_TYPE)a3.s0;
				848	acc3 += b1 * (VECTOR_TYPE)a3.s1;
				849	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	850	}
				851
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	852	for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	853	{
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	854	// Load values from matrix A
				855	DATA_TYPE a0 = ((__global DATA_TYPE )(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
				856	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				857	DATA_TYPE a1 = ((__global DATA_TYPE )(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
				858	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				859	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				860	DATA_TYPE a2 = ((__global DATA_TYPE )(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
				861	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				862	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				863	DATA_TYPE a3 = ((__global DATA_TYPE )(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
				864	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				865	// Load values from matrix B
				866	VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	867
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	868	// Accumulate
				869	acc0 += b0 * (VECTOR_TYPE)a0;
				870	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				871	acc1 += b0 * (VECTOR_TYPE)a1;
				872	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				873	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				874	acc2 += b0 * (VECTOR_TYPE)a2;
				875	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				876	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				877	acc3 += b0 * (VECTOR_TYPE)a3;
				878	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	879	}
				880
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	881	// Compute destination address
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	882	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				883
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	884	// Multiply by the weight of matrix-matrix product and store the result
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	885	#if defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	886	acc0 = acc0 * (VECTOR_TYPE)ALPHA;
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	887	#endif // defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	888	VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
				889	(acc0, 0, (__global DATA_TYPE *)(offset(&dst, 0, 0)));
				890	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	891	#if defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	892	acc1 = acc1 * (VECTOR_TYPE)ALPHA;
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	893	#endif // defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	894	VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
				895	(acc1, 0, (__global DATA_TYPE *)(offset(&dst, 0, 1)));
				896	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				897	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	898	#if defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	899	acc2 = acc2 * (VECTOR_TYPE)ALPHA;
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	900	#endif // defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	901	VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
				902	(acc2, 0, (__global DATA_TYPE *)(offset(&dst, 0, 2)));
				903	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				904	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	905	#if defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	906	acc3 = acc3 * (VECTOR_TYPE)ALPHA;
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	907	#endif // defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	908	VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
				909	(acc3, 0, (__global DATA_TYPE *)(offset(&dst, 0, 3)));
				910	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	911	}
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	912	#endif // defined(DATA_TYPE)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	913
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	914	/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
				915	*
				916	* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
				917	* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
				918	* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
				919	* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
				920	* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
				921	*
				922	* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
				923	* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
				924	* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				925	* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
				926	* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				927	* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
				928	* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
				929	* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
				930	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				931	* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
				932	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				933	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
				934	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
				935	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
				936	* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
				937	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
				938	* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
				939	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				940	*/
				941	__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
				942	IMAGE_DECLARATION(src1),
				943	IMAGE_DECLARATION(dst))
				944	{
				945	int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
				946
				947	// Compute starting address for matrix A and matrix B
				948	int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
				949
				950	// Update address for matrix A
				951	src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
				952
				953	// Update address for matrix B
				954	src_addr.s1 += idx * sizeof(float);
				955
				956	// Address boundary for matrix A
				957	int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
				958
				959	// Initialize accumulators
				960	float acc00 = 0.0f;
				961	float acc01 = 0.0f;
				962	float acc02 = 0.0f;
				963	float acc03 = 0.0f;
				964
				965	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				966	float acc10 = 0.0f;
				967	float acc11 = 0.0f;
				968	float acc12 = 0.0f;
				969	float acc13 = 0.0f;
				970	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				971
				972	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				973	float acc20 = 0.0f;
				974	float acc21 = 0.0f;
				975	float acc22 = 0.0f;
				976	float acc23 = 0.0f;
				977	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				978
				979	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				980	float acc30 = 0.0f;
				981	float acc31 = 0.0f;
				982	float acc32 = 0.0f;
				983	float acc33 = 0.0f;
				984	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				985
				986	// A and B src indices get incremented at the same time.
				987	for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
				988	{
				989	// Load values from matrix A
				990	float2 a0 = vload2(0, (__global float )(src0_ptr + src_addr.s0 + 0 src0_stride_y));
				991	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				992	float2 a1 = vload2(0, (__global float )(src0_ptr + src_addr.s0 + 1 src0_stride_y));
				993	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				994	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				995	float2 a2 = vload2(0, (__global float )(src0_ptr + src_addr.s0 + 2 src0_stride_y));
				996	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				997	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				998	float2 a3 = vload2(0, (__global float )(src0_ptr + src_addr.s0 + 3 src0_stride_y));
				999	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1000	// Load values from matrix B
				1001	float4 b0 = vload4(0, (__global float )(src1_ptr + src_addr.s1 + 0 src1_stride_y));
				1002	float4 b1 = vload4(0, (__global float )(src1_ptr + src_addr.s1 + 1 src1_stride_y));
				1003
				1004	// Multiply and accumulate
				1005	acc00 = fma(a0.s0, b0.s0, acc00);
				1006	acc00 = fma(a0.s1, b1.s0, acc00);
				1007	acc01 = fma(a0.s0, b0.s1, acc01);
				1008	acc01 = fma(a0.s1, b1.s1, acc01);
				1009	acc02 = fma(a0.s0, b0.s2, acc02);
				1010	acc02 = fma(a0.s1, b1.s2, acc02);
				1011	acc03 = fma(a0.s1, b1.s3, acc03);
				1012	acc03 = fma(a0.s0, b0.s3, acc03);
				1013
				1014	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1015	acc10 = fma(a1.s0, b0.s0, acc10);
				1016	acc11 = fma(a1.s0, b0.s1, acc11);
				1017	acc12 = fma(a1.s0, b0.s2, acc12);
				1018	acc13 = fma(a1.s0, b0.s3, acc13);
				1019
				1020	acc10 = fma(a1.s1, b1.s0, acc10);
				1021	acc11 = fma(a1.s1, b1.s1, acc11);
				1022	acc12 = fma(a1.s1, b1.s2, acc12);
				1023	acc13 = fma(a1.s1, b1.s3, acc13);
				1024	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1025	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1026	acc20 = fma(a2.s0, b0.s0, acc20);
				1027	acc21 = fma(a2.s0, b0.s1, acc21);
				1028	acc22 = fma(a2.s0, b0.s2, acc22);
				1029	acc23 = fma(a2.s0, b0.s3, acc23);
				1030
				1031	acc20 = fma(a2.s1, b1.s0, acc20);
				1032	acc21 = fma(a2.s1, b1.s1, acc21);
				1033	acc22 = fma(a2.s1, b1.s2, acc22);
				1034	acc23 = fma(a2.s1, b1.s3, acc23);
				1035	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1036	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1037	acc30 = fma(a3.s0, b0.s0, acc30);
				1038	acc31 = fma(a3.s0, b0.s1, acc31);
				1039	acc32 = fma(a3.s0, b0.s2, acc32);
				1040	acc33 = fma(a3.s0, b0.s3, acc33);
				1041
				1042	acc30 = fma(a3.s1, b1.s0, acc30);
				1043	acc31 = fma(a3.s1, b1.s1, acc31);
				1044	acc32 = fma(a3.s1, b1.s2, acc32);
				1045	acc33 = fma(a3.s1, b1.s3, acc33);
				1046	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1047	}
				1048
				1049	for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
				1050	{
				1051	// Load values from matrix A
				1052	float a0 = ((__global float )(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
				1053	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1054	float a1 = ((__global float )(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
				1055	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1056	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1057	float a2 = ((__global float )(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
				1058	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1059	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1060	float a3 = ((__global float )(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
				1061	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1062	// Load values from matrix B
				1063	float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
				1064
				1065	// Multiply and accumulate
				1066	acc00 = fma(a0, b0.s0, acc00);
				1067	acc01 = fma(a0, b0.s1, acc01);
				1068	acc02 = fma(a0, b0.s2, acc02);
				1069	acc03 = fma(a0, b0.s3, acc03);
				1070	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1071	acc10 = fma(a1, b0.s0, acc10);
				1072	acc11 = fma(a1, b0.s1, acc11);
				1073	acc12 = fma(a1, b0.s2, acc12);
				1074	acc13 = fma(a1, b0.s3, acc13);
				1075	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1076	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1077	acc20 = fma(a2, b0.s0, acc20);
				1078	acc21 = fma(a2, b0.s1, acc21);
				1079	acc22 = fma(a2, b0.s2, acc22);
				1080	acc23 = fma(a2, b0.s3, acc23);
				1081	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1082	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1083	acc30 = fma(a3, b0.s0, acc30);
				1084	acc31 = fma(a3, b0.s1, acc31);
				1085	acc32 = fma(a3, b0.s2, acc32);
				1086	acc33 = fma(a3, b0.s3, acc33);
				1087	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1088	}
				1089
				1090	// Compute destination address
				1091	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				1092
				1093	// Multiply by the weight of matrix-matrix product and store the result
				1094	#if defined(ALPHA)
				1095	acc00 = acc00 * ALPHA;
				1096	acc01 = acc01 * ALPHA;
				1097	acc02 = acc02 * ALPHA;
				1098	acc03 = acc03 * ALPHA;
				1099	#endif // defined(ALPHA)
				1100
				1101	float4 acc0 = ((float4)(acc00, acc01, acc02, acc03));
				1102	vstore4(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
				1103
				1104	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1105	#if defined(ALPHA)
				1106	acc10 = acc10 * ALPHA;
				1107	acc11 = acc11 * ALPHA;
				1108	acc12 = acc12 * ALPHA;
				1109	acc13 = acc13 * ALPHA;
				1110	#endif // defined(ALPHA)
				1111	float4 acc1 = ((float4)(acc10, acc11, acc12, acc13));
				1112	vstore4(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
				1113	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1114	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1115	#if defined(ALPHA)
				1116	acc20 = acc20 * ALPHA;
				1117	acc21 = acc21 * ALPHA;
				1118	acc22 = acc22 * ALPHA;
				1119	acc23 = acc23 * ALPHA;
				1120	#endif // defined(ALPHA)
				1121	float4 acc2 = ((float4)(acc20, acc21, acc22, acc23));
				1122	vstore4(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
				1123	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1124	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1125	#if defined(ALPHA)
				1126	acc30 = acc30 * ALPHA;
				1127	acc31 = acc31 * ALPHA;
				1128	acc32 = acc32 * ALPHA;
				1129	acc33 = acc33 * ALPHA;
				1130	#endif // defined(ALPHA)
				1131	float4 acc3 = ((float4)(acc30, acc31, acc32, acc33));
				1132	vstore4(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
				1133	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1134	}
				1135
				1136	/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
				1137	*
				1138	* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
				1139	* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.
				1140	* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
				1141	* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
				1142	* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
				1143	* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
				1144	*
				1145	* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
				1146	* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
				1147	* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				1148	* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
				1149	* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				1150	* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
				1151	* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
				1152	* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
				1153	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				1154	* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
				1155	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				1156	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
				1157	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
				1158	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
				1159	* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
				1160	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
				1161	* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
				1162	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				1163	*/
				1164	__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
				1165	IMAGE_DECLARATION(src1),
				1166	IMAGE_DECLARATION(dst))
				1167	{
				1168	// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1169	int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
				1170
				1171	// Compute starting address for matrix A and Matrix B
				1172	int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
				1173
				1174	// Update address for the matrix A
				1175	src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
				1176
				1177	// Update address for the matrix B
				1178	src_addr.s1 += idx * sizeof(float);
				1179
				1180	// Address boundary for the matrix A
				1181	int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
				1182
				1183	// Initialize accumulators
				1184	float acc00 = 0.0f;
				1185	float acc01 = 0.0f;
				1186
				1187	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1188	float acc10 = 0.0f;
				1189	float acc11 = 0.0f;
				1190	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1191	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1192	float acc20 = 0.0f;
				1193	float acc21 = 0.0f;
				1194	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1195	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1196	float acc30 = 0.0f;
				1197	float acc31 = 0.0f;
				1198	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1199
				1200	// A and B src indices get incremented at the same time.
				1201	for(; src_addr.s0 <= (end_row_vec_a - 4 * (int)sizeof(float)); src_addr += (int2)(4 * sizeof(float), 4 * src1_stride_y))
				1202	{
				1203	// Load values from matrix A
				1204	float4 a0 = vload4(0, (__global float )(src0_ptr + src_addr.s0 + 0 src0_stride_y));
				1205
				1206	// Load values from matrix B
				1207	float2 b0 = vload2(0, (__global float )(src1_ptr + src_addr.s1 + 0 src1_stride_y));
				1208	float2 b1 = vload2(0, (__global float )(src1_ptr + src_addr.s1 + 1 src1_stride_y));
				1209	float2 b2 = vload2(0, (__global float )(src1_ptr + src_addr.s1 + 2 src1_stride_y));
				1210	float2 b3 = vload2(0, (__global float )(src1_ptr + src_addr.s1 + 3 src1_stride_y));
				1211
				1212	// Multiply and accumulate
				1213	acc00 = fma(a0.s0, b0.s0, acc00);
				1214	acc00 = fma(a0.s1, b1.s0, acc00);
				1215	acc00 = fma(a0.s2, b2.s0, acc00);
				1216	acc00 = fma(a0.s3, b3.s0, acc00);
				1217
				1218	acc01 = fma(a0.s0, b0.s1, acc01);
				1219	acc01 = fma(a0.s1, b1.s1, acc01);
				1220	acc01 = fma(a0.s2, b2.s1, acc01);
				1221	acc01 = fma(a0.s3, b3.s1, acc01);
				1222
				1223	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1224	a0 = vload4(0, (__global float )(src0_ptr + src_addr.s0 + 1 src0_stride_y));
				1225	acc10 = fma(a0.s0, b0.s0, acc10);
				1226	acc10 = fma(a0.s1, b1.s0, acc10);
				1227	acc10 = fma(a0.s2, b2.s0, acc10);
				1228	acc10 = fma(a0.s3, b3.s0, acc10);
				1229
				1230	acc11 = fma(a0.s0, b0.s1, acc11);
				1231	acc11 = fma(a0.s1, b1.s1, acc11);
				1232	acc11 = fma(a0.s2, b2.s1, acc11);
				1233	acc11 = fma(a0.s3, b3.s1, acc11);
				1234	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1235	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1236	a0 = vload4(0, (__global float )(src0_ptr + src_addr.s0 + 2 src0_stride_y));
				1237	acc20 = fma(a0.s0, b0.s0, acc20);
				1238	acc20 = fma(a0.s1, b1.s0, acc20);
				1239	acc20 = fma(a0.s2, b2.s0, acc20);
				1240	acc20 = fma(a0.s3, b3.s0, acc20);
				1241
				1242	acc21 = fma(a0.s0, b0.s1, acc21);
				1243	acc21 = fma(a0.s1, b1.s1, acc21);
				1244	acc21 = fma(a0.s2, b2.s1, acc21);
				1245	acc21 = fma(a0.s3, b3.s1, acc21);
				1246	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1247	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1248	a0 = vload4(0, (__global float )(src0_ptr + src_addr.s0 + 3 src0_stride_y));
				1249	acc30 = fma(a0.s0, b0.s0, acc30);
				1250	acc30 = fma(a0.s1, b1.s0, acc30);
				1251	acc30 = fma(a0.s2, b2.s0, acc30);
				1252	acc30 = fma(a0.s3, b3.s0, acc30);
				1253
				1254	acc31 = fma(a0.s0, b0.s1, acc31);
				1255	acc31 = fma(a0.s1, b1.s1, acc31);
				1256	acc31 = fma(a0.s2, b2.s1, acc31);
				1257	acc31 = fma(a0.s3, b3.s1, acc31);
				1258	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1259	}
				1260	// float size increment
				1261	for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(4, src1_stride_y))
				1262	{
				1263	// Load values from matrix A
				1264	float a0 = ((__global float )(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
				1265	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1266	float a1 = ((__global float )(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
				1267	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1268	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1269	float a2 = ((__global float )(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
				1270	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1271	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1272	float a3 = ((__global float )(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
				1273	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1274	// Load values from matrix B
				1275	float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
				1276
				1277	// Multiply and accumulate
				1278	acc00 = fma(a0, b0.s0, acc00);
				1279	acc01 = fma(a0, b0.s1, acc01);
				1280	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1281	acc10 = fma(a1, b0.s0, acc10);
				1282	acc11 = fma(a1, b0.s1, acc11);
				1283	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1284	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1285	acc20 = fma(a2, b0.s0, acc20);
				1286	acc21 = fma(a2, b0.s1, acc21);
				1287	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1288	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1289	acc30 = fma(a3, b0.s0, acc30);
				1290	acc31 = fma(a3, b0.s1, acc31);
				1291	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1292	}
				1293
				1294	// Compute destination address
				1295	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				1296
				1297	// Multiply by the weight of matrix-matrix product and store the result
				1298	#if defined(ALPHA)
				1299	acc00 = acc00 * ALPHA;
				1300	acc01 = acc01 * ALPHA;
				1301	#endif // defined(ALPHA)
				1302	float2 acc0 = ((float2)(acc00, acc01));
				1303	vstore2(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
				1304	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1305	#if defined(ALPHA)
				1306	acc10 = acc10 * ALPHA;
				1307	acc11 = acc11 * ALPHA;
				1308	#endif // defined(ALPHA)
				1309	float2 acc1 = ((float2)(acc10, acc11));
				1310	vstore2(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
				1311	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1312	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1313	#if defined(ALPHA)
				1314	acc20 = acc20 * ALPHA;
				1315	acc21 = acc21 * ALPHA;
				1316	#endif // defined(ALPHA)
				1317	float2 acc2 = ((float2)(acc20, acc21));
				1318	vstore2(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
				1319	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1320	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1321	#if defined(ALPHA)
				1322	acc30 = acc30 * ALPHA;
				1323	acc31 = acc31 * ALPHA;
				1324	#endif // defined(ALPHA)
				1325	float2 acc3 = (float2)(acc30, acc31);
				1326	vstore2(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
				1327	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1328	}
				1329
				1330	#if defined(FIXED_POINT_POSITION)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1331	/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1332	*
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1333	* @note This OpenCL kernel works with fixed point data types QS8
				1334	* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1335	* @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1336	* @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1337	* @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1338	*
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1339	* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1340	* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
				1341	* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				1342	* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
				1343	* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				1344	* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
				1345	* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
				1346	* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
				1347	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				1348	* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
				1349	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				1350	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
				1351	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
				1352	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
				1353	* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
				1354	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
				1355	* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
				1356	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				1357	*/
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1358	__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1359	IMAGE_DECLARATION(src1),
				1360	IMAGE_DECLARATION(dst))
				1361	{
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1362	int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1363
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1364	// Compute starting address for matrix A and Matrix B
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1365	int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1366
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1367	// Update address for the matrix A
				1368	src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1369
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1370	// Update address for the matrix B
				1371	src_addr.s1 += idx * sizeof(char);
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1372
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1373	int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
				1374
				1375	short8 acc00 = 0;
				1376	short8 acc01 = 0;
				1377	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1378	short8 acc10 = 0;
				1379	short8 acc11 = 0;
				1380	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1381	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1382	short8 acc20 = 0;
				1383	short8 acc21 = 0;
				1384	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1385	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1386	short8 acc30 = 0;
				1387	short8 acc31 = 0;
				1388	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1389
				1390	// This for loop performs 4 accumulations per iteration
				1391	for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1392	{
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1393	char2 a0 = vload2(0, (__global char )(src0_ptr + src_addr.s0 + 0 src0_stride_y));
				1394	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1395	char2 a1 = vload2(0, (__global char )(src0_ptr + src_addr.s0 + 1 src0_stride_y));
				1396	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1397	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1398	char2 a2 = vload2(0, (__global char )(src0_ptr + src_addr.s0 + 2 src0_stride_y));
				1399	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1400	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1401	char2 a3 = vload2(0, (__global char )(src0_ptr + src_addr.s0 + 3 src0_stride_y));
				1402	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1403	char16 b0 = vload16(0, (__global char )(src1_ptr + src_addr.s1 + 0 src1_stride_y));
				1404	char16 b1 = vload16(0, (__global char )(src1_ptr + src_addr.s1 + 1 src1_stride_y));
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1405
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1406	acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
				1407	acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s1, b1.s01234567, FIXED_POINT_POSITION);
				1408	acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
				1409	acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
				1410	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1411	acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s0, b0.s01234567, FIXED_POINT_POSITION);
				1412	acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s1, b1.s01234567, FIXED_POINT_POSITION);
				1413	acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
				1414	acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
				1415	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1416	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1417	acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s0, b0.s01234567, FIXED_POINT_POSITION);
				1418	acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s1, b1.s01234567, FIXED_POINT_POSITION);
				1419	acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
				1420	acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
				1421	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1422	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1423	acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s0, b0.s01234567, FIXED_POINT_POSITION);
				1424	acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s1, b1.s01234567, FIXED_POINT_POSITION);
				1425	acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
				1426	acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
				1427	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1428	}
				1429
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1430	// Left-over accumulations
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1431	for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
				1432	{
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1433	char a0 = ((__global char )(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
				1434	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1435	char a1 = ((__global char )(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
				1436	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1437	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1438	char a2 = ((__global char )(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
				1439	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1440	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1441	char a3 = ((__global char )(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
				1442	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1443	char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1));
				1444
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1445	acc00 = mlal_sat_qs8x8(acc00, (char8)a0, b0.s01234567, FIXED_POINT_POSITION);
				1446	acc01 = mlal_sat_qs8x8(acc01, (char8)a0, b0.s89ABCDEF, FIXED_POINT_POSITION);
				1447	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1448	acc10 = mlal_sat_qs8x8(acc10, (char8)a1, b0.s01234567, FIXED_POINT_POSITION);
				1449	acc11 = mlal_sat_qs8x8(acc11, (char8)a1, b0.s89ABCDEF, FIXED_POINT_POSITION);
				1450	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1451	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1452	acc20 = mlal_sat_qs8x8(acc20, (char8)a2, b0.s01234567, FIXED_POINT_POSITION);
				1453	acc21 = mlal_sat_qs8x8(acc21, (char8)a2, b0.s89ABCDEF, FIXED_POINT_POSITION);
				1454	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1455	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1456	acc30 = mlal_sat_qs8x8(acc30, (char8)a3, b0.s01234567, FIXED_POINT_POSITION);
				1457	acc31 = mlal_sat_qs8x8(acc31, (char8)a3, b0.s89ABCDEF, FIXED_POINT_POSITION);
				1458	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1459	}
				1460
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1461	// Compute destination address
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1462	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				1463
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1464	// Multiply by the weight of matrix product and store the result
				1465	char16 acc_qs8;
				1466	acc_qs8 = convert_char16_sat((short16)(acc00, acc01));
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1467	#if defined(ALPHA)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1468	acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1469	#endif // defined(ALPHA)
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1470	vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1471	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1472	acc_qs8 = convert_char16_sat((short16)(acc10, acc11));
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1473	#if defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1474	acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1475	#endif // defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1476	vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
				1477	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
				1478	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1479	acc_qs8 = convert_char16_sat((short16)(acc20, acc21));
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1480	#if defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1481	acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1482	#endif // defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1483	vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
				1484	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
				1485	#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
				1486	acc_qs8 = convert_char16_sat((short16)(acc30, acc31));
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1487	#if defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1488	acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1489	#endif // defined(ALPHA)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1490	vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
				1491	#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1492	}
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	1493
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * @note This OpenCL kernel works with fixed point data types QS16
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
 * @note Each work-item always loads/stores 8 values along X (vload8/vstore8) and handles at most 4 rows along Y.
 *       NUM_ELEMS_PROCESSED_PER_THREAD_X is therefore presumably expected to be 8 - TODO confirm against the host-side configuration
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A
 * @note The fixed point position must be passed at compile time using -DFIXED_POINT_POSITION
 * @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA. When ALPHA is not defined the product is stored without scaling
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
                           IMAGE_DECLARATION(src1),
                           IMAGE_DECLARATION(dst))
{
    // First output column (in elements) handled by this work-item
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B (byte offsets: .s0 -> A, .s1 -> B)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A: skip the rows owned by the previous work-items along Y
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B: move to the first column owned by this work-item
    src_addr.s1 += idx * sizeof(short);

    // Byte offset one past the last element of the first row of A processed by this work-item
    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));

    // One 8-lane accumulator per output row; int lanes keep the intermediate sums wider than the QS16 output
    int8 acc0 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    int8 acc1 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    int8 acc2 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    int8 acc3 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: each iteration consumes 2 elements of every processed row of A and 2 rows of B,
    // i.e. it performs 2 accumulations per output row.
    // sizeof() is cast to int so that the subtraction in the loop condition stays signed.
    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
    {
        // Two consecutive elements from each processed row of A
        short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        short2 a1 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        short2 a2 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        short2 a3 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Eight consecutive elements from each of the two matching rows of B
        short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
        short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

        // Broadcast each A element over the 8 lanes and accumulate its product with the matching row of B
        acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);
        acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s0, b0, FIXED_POINT_POSITION);
        acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s1, b1, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s0, b0, FIXED_POINT_POSITION);
        acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s1, b1, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s0, b0, FIXED_POINT_POSITION);
        acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s1, b1, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Left-over accumulations: one element of A / one row of B per iteration (runs when COLS_A is odd)
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))
    {
        short a0 = *((__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        short a1 = *((__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        short a2 = *((__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        short a3 = *((__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));

        acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = mlal_sat_qs16x8(acc1, (short8)a1, b0, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = mlal_sat_qs16x8(acc2, (short8)a2, b0, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = mlal_sat_qs16x8(acc3, (short8)a3, b0, FIXED_POINT_POSITION);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // For every processed row: saturate the accumulator down to 16 bit, optionally scale it by ALPHA and store 8 values
    short8 acc_qs16;
    acc_qs16 = convert_short8_sat(acc0);
#if defined(ALPHA)
    acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    acc_qs16 = convert_short8_sat(acc1);
#if defined(ALPHA)
    acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    acc_qs16 = convert_short8_sat(acc2);
#if defined(ALPHA)
    acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    acc_qs16 = convert_short8_sat(acc3);
#if defined(ALPHA)
    acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
#endif // defined(ALPHA)
    vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	1640	#endif // defined(FIXED_POINT_POSITION)
				1641	#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1642
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1643	#if defined(BETA)
/** In-place matrix addition for F32 data: dst = dst + BETA * src
 *
 * The destination is read first (the original comments describe it as holding the values of A x B),
 * the source matrix C is scaled by BETA and added, and the sum overwrites the destination.
 * Each work-item handles 4 consecutive float values.
 *
 * @attention The beta's value needs to be passed at compile time using -DBETA
 *
 * @param[in]     src_ptr                           Pointer to the source matrix (matrix C). Supported data types: F32
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix, read and then overwritten. Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
                          IMAGE_DECLARATION(dst))
{
    // Per-work-item views of matrix C (src) and of the in/out matrix (dst)
    Image mtx_c  = CONVERT_TO_IMAGE_STRUCT(src);
    Image mtx_ab = CONVERT_TO_IMAGE_STRUCT(dst);

    // The destination is both read and written through the same address
    __global float *ab_addr = (__global float *)mtx_ab.ptr;

    // Values currently stored in the destination
    float4 ab_vals = vload4(0, ab_addr);

    // Matching values of matrix C
    float4 c_vals = vload4(0, (__global float *)mtx_c.ptr);

    // dst + BETA * C, written back over the values just read
    float4 result = ab_vals + (float4)BETA * c_vals;
    vstore4(result, 0, ab_addr);
}
				1680
				1681	/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
				1682	*
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1683	* @attention The beta's value need to be passed at compile time using -DBETA
				1684	*
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1685	* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
				1686	* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
				1687	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				1688	* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
				1689	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				1690	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1691	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1692	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
				1693	* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
				1694	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
				1695	* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
				1696	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				1697	*/
				1698	__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
				1699	IMAGE_DECLARATION(dst))
				1700	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1701	// Compute source and destination addresses
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1702	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				1703	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				1704
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1705	// Load values from A x B
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1706	half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
				1707
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1708	// Load values from Matrix C
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1709	half8 c = vload8(0, (__global half *)src.ptr);
				1710
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1711	// Computes alpha * axb + beta * c
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1712	half8 out = alpha_ab + (half8)BETA * c;
				1713
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1714	// Store final result in axb matrix
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1715	vstore8(out, 0, (__global half *)dst.ptr);
				1716	}
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1717
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1718	#if defined(FIXED_POINT_POSITION)
/** In-place matrix addition for 8 bit fixed point (QS8) data, with the source matrix weighted by the scalar BETA
 *
 * The destination is read first (the original comments describe it as holding the values of A x B),
 * combined with BETA and the source matrix C through mla_sat_qs8x16(), and the result overwrites the destination.
 * Each work-item handles 16 consecutive 8 bit values.
 *
 * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
 *
 * @note BETA must be passed in 8 bit fixed point format
 * @note mla_sat_qs8x16() is defined outside this kernel; it is presumably a saturating fixed point multiply-accumulate (dst + BETA * src) - TODO confirm in fixed_point.h
 *
 * @param[in]     src_ptr                           Pointer to the source matrix (matrix C). Supported data types: QS8
 * @param[in]     src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]     src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]     src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in,out] dst_ptr                           Pointer to the destination matrix, read and then overwritten. Supported data types: same as @p src_ptr
 * @param[in]     dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]     dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]     dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]     dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]     dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),
                          IMAGE_DECLARATION(dst))
{
    // Per-work-item views of matrix C (src) and of the in/out matrix (dst)
    Image mtx_c  = CONVERT_TO_IMAGE_STRUCT(src);
    Image mtx_ab = CONVERT_TO_IMAGE_STRUCT(dst);

    // The destination is both read and written through the same address
    __global char *ab_addr = (__global char *)mtx_ab.ptr;

    // Values currently stored in the destination
    char16 ab_vals = vload16(0, ab_addr);

    // Matching values of matrix C
    char16 c_vals = vload16(0, (__global char *)mtx_c.ptr);

    // Accumulate BETA (broadcast to all 16 lanes) and C onto the destination values
    char16 result = mla_sat_qs8x16(ab_vals, (char16)BETA, c_vals, FIXED_POINT_POSITION);

    // Write the result back over the values just read
    vstore16(result, 0, ab_addr);
}
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	1757
				1758	/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
				1759	*
				1760	* @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
				1761	*
				1762	* @note: BETA must be passed in 16 bit fixed point format
				1763	*
				1764	* @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16
				1765	* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
				1766	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				1767	* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
				1768	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				1769	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
				1770	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
				1771	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
				1772	* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
				1773	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
				1774	* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
				1775	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				1776	*/
				1777	__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),
				1778	IMAGE_DECLARATION(dst))
				1779	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1780	// Compute source and destination addresses
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	1781	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				1782	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				1783
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1784	// Load values from A x B
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	1785	short8 alpha_ab = vload8(0, (__global short *)dst.ptr);
				1786
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1787	// Load values from Matrix C
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	1788	short8 c = vload8(0, (__global short *)src.ptr);
				1789
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1790	// Computes alpha * axb + beta * c
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	1791	short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);
				1792
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1793	// Store final result in axb matrix
Gian Marco Iodice	8a38369	2017-07-03 17:41:47 +0100	[diff] [blame]	1794	vstore8(out, 0, (__global short *)dst.ptr);
				1795	}
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1796	#endif // defined(FIXED_POINT_POSITION)
				1797	#endif // defined(BETA)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1798
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1799	#if defined(WIDTH_VECTOR_A)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1800	/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
				1801	*
				1802	* @attention The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
				1803	*
				1804	* @attention The input A and matrix B must not be reshaped
				1805	*
				1806	* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
				1807	* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
				1808	* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				1809	* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
				1810	* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				1811	* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1812	* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1813	* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
				1814	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				1815	* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
				1816	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				1817	* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
				1818	* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				1819	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	1820	* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1821	* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
				1822	* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
				1823	* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
				1824	* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
				1825	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
				1826	*/
				1827	__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
				1828	TENSOR3D_DECLARATION(src1),
				1829	IMAGE_DECLARATION(dst))
				1830	{
				1831	int idx = get_global_id(0) * 4;
				1832	int idy = get_global_id(1);
				1833
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1834	// Compute the address for the vector A and matrix B
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1835	int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
				1836	src_addr.s1 += idx * sizeof(float);
				1837
				1838	int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
				1839
				1840	float4 acc = 0.0f;
				1841
Georgios Pinitas	96880cf	2017-10-20 18:52:20 +0100	[diff] [blame]	1842	for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1843	{
				1844	float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
				1845	float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
				1846	float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
				1847
				1848	acc += b0 * (float4)a0.s0;
				1849	acc += b1 * (float4)a0.s1;
				1850	}
				1851
				1852	for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
				1853	{
				1854	float a0 = ((__global float )(src0_ptr + src_addr.s0));
				1855	float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
				1856
				1857	acc += b0 * (float4)a0;
				1858	}
				1859
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1860	// Compute destination address
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1861	Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
				1862
				1863	vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
				1864	}
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	1865	#endif // defined(WIDTH_VECTOR_A)
				1866
				1867	/** This kernel accumulates each row with the biases vector.
				1868	*
				1869	* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
				1870	* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
				1871	*
				1872	* @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/QS8/U16/S16/F16/U32/S32/F32
				1873	* @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
				1874	* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
				1875	* @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)
				1876	* @param[in] accum_step_y accum_stride_y * number of elements along Y processed per workitem(in bytes)
				1877	* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
				1878	* @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
				1879	* @param[in] biases_stride_x Stride of the biases vector in X dimension (in bytes)
				1880	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				1881	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases vector
				1882	*/
				1883	#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
				1884	__kernel void gemm_accumulate_biases(
				1885	IMAGE_DECLARATION(accum),
				1886	VECTOR_DECLARATION(biases))
				1887	{
				1888	Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
				1889	Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
				1890
				1891	// Vector size, i.e. number of vector elements.
				1892	VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
				1893	accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
				1894	VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
				1895	biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
				1896	#ifdef FIXED_POINT_POSITION
				1897	accum_value = ADD_SAT_OP_EXPAND(biases_value, accum_value, DATA_TYPE, VECTOR_SIZE);
				1898	#else // FIXED_POINT_POSITION
				1899	accum_value = biases_value + accum_value;
				1900	#endif // FIXED_POINT_POSITION
				1901	// Store result in the accumulate buffer
				1902	VSTORE(VECTOR_SIZE)
				1903	(accum_value, 0, (__global DATA_TYPE *)accum.ptr);
				1904	}
				1905	#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)