/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "tile_helpers.h"

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of leftover output rows/columns must be passed at compile time using -DPARTIAL_STORE_M0 and -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_M0=3, -DPARTIAL_STORE_N0=2)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0 are not expected to be efficient
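 * @note The quantization offsets of the lhs/rhs tensors and the requantization parameters of the dst tensor
 *       must also be passed at compile time (-DLHS_OFFSET, -DRHS_OFFSET, -DDST_OFFSET, -DDST_SHIFT and
 *       -DDST_MULTIPLIER), since the kernel body references these defines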
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

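    // The quantized accumulation is decomposed into four terms:
    //
    //   sum_k (a_k + LHS_OFFSET) * (b_k + RHS_OFFSET) = sum_k a_k * b_k                (T_MMUL in the loops below)
    //                                                 + RHS_OFFSET * sum_k a_k         (a_sum, folded in after the loops)
    //                                                 + LHS_OFFSET * sum_k b_k         (b_sum, folded in after the loops)
    //                                                 + K * LHS_OFFSET * RHS_OFFSET    (accumulator initialization above)
    //
    // where LHS_OFFSET and RHS_OFFSET are assumed to hold the negated quantization offsets (zero points)
    // of the lhs and rhs tensors, so that (a_k + LHS_OFFSET) is the zero-point-corrected lhs value.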
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion
        // in order to use T_MMUL_NT_T macro because only this macro
        // can utilize dot product instruction for Int8/UInt8 by
        // directly multiplying the rows of Lhs and Rhs tensors.
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion.
        // See the main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

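    // Fold the row/column sums into the accumulators to complete the
    // decomposition described before the K loop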
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)LHS_OFFSET) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

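    // Build the per-row destination indices for the store below. When y_cond flags this work-item
    // as the one handling the partial M block, the indices are clamped to PARTIAL_STORE_M0 - 1 so
    // the surplus rows re-store the last valid row instead of writing out of bounds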
    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of leftover output rows/columns must be passed at compile time using -DPARTIAL_STORE_M0 and -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_M0=3, -DPARTIAL_STORE_N0=2)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
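 * @note The quantization offsets of the lhs/rhs tensors and the requantization parameters of the dst tensor
 *       must also be passed at compile time (-DLHS_OFFSET, -DRHS_OFFSET, -DDST_OFFSET, -DDST_SHIFT and
 *       -DDST_MULTIPLIER), since the kernel body references these defines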
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

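    // acc, a_sum and b_sum implement the same offset decomposition as in mat_mul_native_quantized_nt_nt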
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
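        // The rhs tensor is stored transposed, so its rows already run along K: both tiles can be
        // read with a plain T_LOAD and consumed by T_MMUL(..., NT, T, ...) without T_LOAD_TRANSPOSED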
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

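    // Fold the row/column sums into the accumulators (see mat_mul_native_quantized_nt_nt)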
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)LHS_OFFSET) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of leftover output rows/columns must be passed at compile time using -DPARTIAL_STORE_M0 and -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_M0=3, -DPARTIAL_STORE_N0=2)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
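 * @note The quantization offsets of the lhs/rhs tensors and the requantization parameters of the dst tensor
 *       must also be passed at compile time (-DLHS_OFFSET, -DRHS_OFFSET, -DDST_OFFSET, -DDST_SHIFT and
 *       -DDST_MULTIPLIER), since the kernel body references these defines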
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

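    // Fold the row/column sums into the accumulators (see mat_mul_native_quantized_nt_nt)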
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)LHS_OFFSET) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of leftover output rows/columns must be passed at compile time using -DPARTIAL_STORE_M0 and -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_M0=3, -DPARTIAL_STORE_N0=2)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
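 * @note The quantization offsets of the lhs/rhs tensors and the requantization parameters of the dst tensor
 *       must also be passed at compile time (-DLHS_OFFSET, -DRHS_OFFSET, -DDST_OFFSET, -DDST_SHIFT and
 *       -DDST_MULTIPLIER), since the kernel body references these defines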
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
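        // No transposed load is needed here: the rhs tensor is stored transposed,
        // so its rows already run along K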
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

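    // Fold the row/column sums into the accumulators (see mat_mul_native_quantized_nt_nt)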
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)LHS_OFFSET) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)