Blame - src/core/CL/cl_kernels/common/mat_mul_quantized.cl - ml/ComputeLibrary

blob: 8cf857dd848ad9c786f3ece08921f438277ca009 [file] [log] [blame]

Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2023 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Mohammed Suhail Munshi	a2bb80e	2023-06-19 14:57:57 +0100	[diff] [blame^]	24	#include "activation_float_helpers.h"
Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	25	#include "helpers.h"
				26	#include "tile_helpers.h"
				27
				28	#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
				29	/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
				30	*
				31	* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
				32	* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
				33	* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
				34	* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
				35	* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	36	* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	37	* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
				38	* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
				39	* @note Only the following configurations of M0, N0 and K0 are currently supported:
				40	* - M0 > 0
				41	* - N0 = 1, 2, 3, 4, 8, 16
				42	* - K0 = 1, 2, 3, 4, 8, 16
				43	* @note Values > 8 for M0 are not expected to be efficient
				44	*
				45	* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
				46	* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
				47	* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
				48	* @param[in] lhs_w The width of the lhs tensor
				49	* @param[in] lhs_h The height of the lhs tensor
				50	* @param[in] lhs_n Number of the matrices (buffers) in the batch
				51	* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
				52	* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
				53	* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
				54	* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
				55	* @param[in] rhs_w The width of the rhs tensor
				56	* @param[in] rhs_h The height of the rhs tensor
				57	* @param[in] rhs_n Number of the matrices (buffers) in the batch
				58	* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
				59	* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
				60	* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
				61	* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
				62	* @param[in] dst_w The width of the dst tensor
				63	* @param[in] dst_h The height of the dst tensor
				64	* @param[in] dst_n Number of the matrices (buffers) in the batch
				65	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
				66	*/
				67	__kernel void mat_mul_native_quantized_nt_nt(
				68	TENSOR3D_T(lhs, BUFFER),
				69	TENSOR3D_T(rhs, BUFFER),
				70	TENSOR3D_T(dst, BUFFER))
				71	{
				72	const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
				73	const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
				74	const uint z = GET_SPATIAL_IDX(2, 1, 0);
				75
				76	// Compute LHS/RHS/DST matrix address
				77	lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
				78	rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
				79	dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
				80
				81	// Initialize the accumulators
				82	TILE(int, M0, N0, acc);
				83	LOOP_UNROLLING(int, i, 0, 1, M0,
				84	{
				85	acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
				86	})
				87
				88	TILE(int, 1, N0, b_sum);
				89	b_sum[0].v = 0;
				90
				91	TILE(int, 1, M0, a_sum);
				92	a_sum[0].v = 0;
				93
				94	int k;
				95	for(k = 0; k <= K - K0; k += K0)
				96	{
				97	TILE(DATA_TYPE, M0, K0, a);
				98	TILE(DATA_TYPE, N0, K0, b);
				99
				100	LOOP_UNROLLING(int, i, 0, 1, M0,
				101	{
				102	a[i].v = 0;
				103	})
				104
				105	LOOP_UNROLLING(int, i, 0, 1, N0,
				106	{
				107	b[i].v = 0;
				108	})
				109
				110	// Load tile from the lhs tensor
				111	T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
				112
				113	// Load tile from the rhs tensor in a transposed fashion
				114	// in order to use T_MMUL_NT_T macro because only this macro
				115	// can utilize dot product instruction for Int8/UInt8 by
				116	// directly multiplying the rows of Lhs and Rhs tensors.
				117	T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
				118
				119	T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
				120
				121	LOOP_UNROLLING(int, i, 0, 1, M0,
				122	{
				123	LOOP_UNROLLING(int, j, 0, 1, K0,
				124	{
				125	a_sum[0].s[i] += (int)a[i].s[j];
				126	})
				127	})
				128
				129	LOOP_UNROLLING(int, i, 0, 1, K0,
				130	{
				131	LOOP_UNROLLING(int, j, 0, 1, N0,
				132	{
				133	b_sum[0].s[j] += (int)b[j].s[i];
				134	})
				135	})
				136
				137	lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
				138	rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
				139	}
				140
				141	#if((K % K0) != 0)
				142	/* Leftover Loop */
				143	for(; k < K; ++k)
				144	{
				145	TILE(DATA_TYPE, M0, 1, a);
				146	TILE(DATA_TYPE, N0, 1, b);
				147
				148	LOOP_UNROLLING(int, i, 0, 1, M0,
				149	{
				150	a[i].v = 0;
				151	})
				152
				153	LOOP_UNROLLING(int, i, 0, 1, N0,
				154	{
				155	b[i].v = 0;
				156	})
				157
				158	// Load tile from the lhs tensor
				159	T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
				160
				161	// Load tile from the rhs tensor in a transposed fashion.
				162	// See the main loop for more explanation
				163	T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
				164
				165	T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
				166
				167	LOOP_UNROLLING(int, i, 0, 1, M0,
				168	{
				169	LOOP_UNROLLING(int, j, 0, 1, 1,
				170	{
				171	a_sum[0].s[i] += (int)a[i].s[j];
				172	})
				173	})
				174
				175	LOOP_UNROLLING(int, i, 0, 1, 1,
				176	{
				177	LOOP_UNROLLING(int, j, 0, 1, N0,
				178	{
				179	b_sum[0].s[j] += (int)b[j].s[i];
				180	})
				181	})
				182
				183	lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
				184	rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
				185	}
				186	#endif // ((K % K0) != 0)
				187
				188	LOOP_UNROLLING(int, i, 0, 1, M0,
				189	{
				190	LOOP_UNROLLING(int, j, 0, 1, N0,
				191	{
Mohammed Suhail Munshi	a2bb80e	2023-06-19 14:57:57 +0100	[diff] [blame^]	192	acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	193	})
				194	})
				195
				196	const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
				197	const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
				198
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	199	T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
				200
Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	201	// Quantize the tile
				202	TILE(DATA_TYPE, M0, N0, accq);
				203	T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
				204
				205	TILE(int, M0, 1, indirect_buffer);
				206	LOOP_UNROLLING(int, _i, 0, 1, M0,
				207	{
				208	indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
				209	});
				210
				211	T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
				212	}
				213	#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
				214
Jakub Sujak	5e99a3e	2023-04-18 08:33:56 +0100	[diff] [blame]	215	#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
				216	/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
				217	*
				218	* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
				219	* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
				220	* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
				221	* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
				222	* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	223	* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
Jakub Sujak	5e99a3e	2023-04-18 08:33:56 +0100	[diff] [blame]	224	* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
				225	* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
				226	* @note Only the following configurations of M0, N0 and K0 are currently supported:
				227	* - M0 > 0
				228	* - N0 = 1, 2, 3, 4, 8, 16
				229	* - K0 = 1, 2, 3, 4, 8, 16
				230	* @note Values > 8 for M0, N0, K0 are not expected to be efficient
				231	*
				232	* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
				233	* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
				234	* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
				235	* @param[in] lhs_w The width of the lhs tensor
				236	* @param[in] lhs_h The height of the lhs tensor
				237	* @param[in] lhs_n Number of the matrices (buffers) in the batch
				238	* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
				239	* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
				240	* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
				241	* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
				242	* @param[in] rhs_w The width of the rhs tensor
				243	* @param[in] rhs_h The height of the rhs tensor
				244	* @param[in] rhs_n Number of the matrices (buffers) in the batch
				245	* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
				246	* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
				247	* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
				248	* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
				249	* @param[in] dst_w The width of the dst tensor
				250	* @param[in] dst_h The height of the dst tensor
				251	* @param[in] dst_n Number of the matrices (buffers) in the batch
				252	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
				253	*/
				254	__kernel void mat_mul_native_quantized_nt_t(
				255	TENSOR3D_T(lhs, BUFFER),
				256	TENSOR3D_T(rhs, BUFFER),
				257	TENSOR3D_T(dst, BUFFER))
				258	{
				259	const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
				260	const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
				261	const uint z = GET_SPATIAL_IDX(2, 1, 0);
				262
				263	// Compute LHS/RHS/DST matrix address
				264	lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
				265	rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
				266	dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
				267
				268	// Initialize the accumulators
				269	TILE(int, M0, N0, acc);
				270	LOOP_UNROLLING(int, i, 0, 1, M0,
				271	{
				272	acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
				273	})
				274
				275	TILE(int, 1, M0, a_sum);
				276	a_sum[0].v = 0;
				277
				278	TILE(int, 1, N0, b_sum);
				279	b_sum[0].v = 0;
				280
				281	int k;
				282	for(k = 0; k <= K - K0; k += K0)
				283	{
				284	TILE(DATA_TYPE, M0, K0, a);
				285	TILE(DATA_TYPE, N0, K0, b);
				286
				287	LOOP_UNROLLING(int, i, 0, 1, M0,
				288	{
				289	a[i].v = 0;
				290	})
				291
				292	LOOP_UNROLLING(int, i, 0, 1, N0,
				293	{
				294	b[i].v = 0;
				295	})
				296
				297	// Load tile from lhs/rhs tensors
				298	T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
				299	T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
				300
				301	T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
				302
				303	LOOP_UNROLLING(int, i, 0, 1, M0,
				304	{
				305	LOOP_UNROLLING(int, j, 0, 1, K0,
				306	{
				307	a_sum[0].s[i] += (int)a[i].s[j];
				308	})
				309	})
				310
				311	LOOP_UNROLLING(int, i, 0, 1, N0,
				312	{
				313	LOOP_UNROLLING(int, j, 0, 1, K0,
				314	{
				315	b_sum[0].s[i] += (int)b[i].s[j];
				316	})
				317	})
				318
				319	lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
				320	rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
				321	}
				322
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	323	#if((K % K0) != 0)
Jakub Sujak	5e99a3e	2023-04-18 08:33:56 +0100	[diff] [blame]	324	// Leftover loop
				325	for(; k < K; ++k)
				326	{
				327	TILE(DATA_TYPE, M0, 1, a);
				328	TILE(DATA_TYPE, N0, 1, b);
				329
				330	LOOP_UNROLLING(int, i, 0, 1, M0,
				331	{
				332	a[i].v = 0;
				333	})
				334
				335	LOOP_UNROLLING(int, i, 0, 1, N0,
				336	{
				337	b[i].v = 0;
				338	})
				339
				340	// Load tile from lhs/rhs tensors
				341	T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
				342	T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
				343
				344	T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
				345
				346	LOOP_UNROLLING(int, i, 0, 1, M0,
				347	{
				348	LOOP_UNROLLING(int, j, 0, 1, 1,
				349	{
				350	a_sum[0].s[i] += (int)a[i].s[j];
				351	})
				352	})
				353
				354	LOOP_UNROLLING(int, i, 0, 1, N0,
				355	{
				356	LOOP_UNROLLING(int, j, 0, 1, 1,
				357	{
				358	b_sum[0].s[i] += (int)b[i].s[j];
				359	})
				360	})
				361
				362	lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
				363	rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
				364	}
				365	#endif // ((K % K0) != 0)
				366
				367	LOOP_UNROLLING(int, i, 0, 1, M0,
				368	{
				369	LOOP_UNROLLING(int, j, 0, 1, N0,
				370	{
Mohammed Suhail Munshi	a2bb80e	2023-06-19 14:57:57 +0100	[diff] [blame^]	371	acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
Jakub Sujak	5e99a3e	2023-04-18 08:33:56 +0100	[diff] [blame]	372	})
				373	})
				374
				375	const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
				376	const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
				377
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	378	T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
				379
Jakub Sujak	5e99a3e	2023-04-18 08:33:56 +0100	[diff] [blame]	380	// Quantize the tile
				381	TILE(DATA_TYPE, M0, N0, accq);
				382	T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
				383
				384	TILE(int, M0, 1, indirect_buffer);
				385	LOOP_UNROLLING(int, _i, 0, 1, M0,
				386	{
				387	indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
				388	});
				389
				390	T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
				391	}
				392	#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
				393
Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	394	#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
				395	/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
				396	*
				397	* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
				398	* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
				399	* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
				400	* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
				401	* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	402	* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	403	* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
				404	* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
				405	* @note Only the following configurations of M0, N0 and K0 are currently supported:
				406	* - M0 > 0
				407	* - N0 = 1, 2, 3, 4, 8, 16
				408	* - K0 = 1, 2, 3, 4, 8, 16
				409	* @note Values > 8 for M0, N0 and K0 are not expected to be efficient
				410	*
				411	* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
				412	* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
				413	* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
				414	* @param[in] lhs_w The width of the lhs tensor
				415	* @param[in] lhs_h The height of the lhs tensor
				416	* @param[in] lhs_n Number of the matrices (buffers) in the batch
				417	* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
				418	* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
				419	* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
				420	* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
				421	* @param[in] rhs_w The width of the rhs tensor
				422	* @param[in] rhs_h The height of the rhs tensor
				423	* @param[in] rhs_n Number of the matrices (buffers) in the batch
				424	* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
				425	* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
				426	* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
				427	* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
				428	* @param[in] dst_w The width of the dst tensor
				429	* @param[in] dst_h The height of the dst tensor
				430	* @param[in] dst_n Number of the matrices (buffers) in the batch
				431	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
				432	*/
				433	__kernel void mat_mul_native_quantized_t_nt(
				434	TENSOR3D_T(lhs, BUFFER),
				435	TENSOR3D_T(rhs, BUFFER),
				436	TENSOR3D_T(dst, BUFFER))
				437	{
				438	const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
				439	const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
				440	const uint z = GET_SPATIAL_IDX(2, 1, 0);
				441
				442	// Compute LHS/RHS/DST matrix address
				443	lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
				444	rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
				445	dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
				446
				447	// Initialize the accumulators
				448	TILE(int, M0, N0, acc);
				449	LOOP_UNROLLING(int, i, 0, 1, M0,
				450	{
				451	acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
				452	})
				453
				454	TILE(int, 1, N0, b_sum);
				455	b_sum[0].v = 0;
				456
				457	TILE(int, 1, M0, a_sum);
				458	a_sum[0].v = 0;
				459
				460	int k;
				461	for(k = 0; k <= K - K0; k += K0)
				462	{
				463	TILE(DATA_TYPE, M0, K0, a);
				464	TILE(DATA_TYPE, N0, K0, b);
				465
				466	LOOP_UNROLLING(int, i, 0, 1, M0,
				467	{
				468	a[i].v = 0;
				469	})
				470
				471	LOOP_UNROLLING(int, i, 0, 1, N0,
				472	{
				473	b[i].v = 0;
				474	})
				475
				476	// Load tile from the lhs/rhs tensors in a transposed fashion
				477	// see mat_mul_native_quantized_nt_nt main loop for more explanation
				478	T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
				479	T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
				480
				481	T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
				482
				483	LOOP_UNROLLING(int, i, 0, 1, K0,
				484	{
				485	LOOP_UNROLLING(int, j, 0, 1, M0,
				486	{
				487	a_sum[0].s[j] += (int)a[j].s[i];
				488	})
				489	})
				490
				491	LOOP_UNROLLING(int, i, 0, 1, K0,
				492	{
				493	LOOP_UNROLLING(int, j, 0, 1, N0,
				494	{
				495	b_sum[0].s[j] += (int)b[j].s[i];
				496	})
				497	})
				498
				499	lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
				500	rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
				501	}
				502
				503	#if((K % K0) != 0)
				504	/* Leftover Loop */
				505	for(; k < K; ++k)
				506	{
				507	TILE(DATA_TYPE, M0, 1, a);
				508	TILE(DATA_TYPE, N0, 1, b);
				509
				510	LOOP_UNROLLING(int, i, 0, 1, M0,
				511	{
				512	a[i].v = 0;
				513	})
				514
				515	LOOP_UNROLLING(int, i, 0, 1, N0,
				516	{
				517	b[i].v = 0;
				518	})
				519
				520	// Load tile from the lhs/rhs tensors in a transposed fashion
				521	// see mat_mul_native_quantized_nt_nt main loop for more explanation
				522	T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
				523	T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
				524
				525	T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
				526
				527	LOOP_UNROLLING(int, i, 0, 1, 1,
				528	{
				529	LOOP_UNROLLING(int, j, 0, 1, M0,
				530	{
				531	a_sum[0].s[j] += (int)a[j].s[i];
				532	})
				533	})
				534
				535	LOOP_UNROLLING(int, i, 0, 1, 1,
				536	{
				537	LOOP_UNROLLING(int, j, 0, 1, N0,
				538	{
				539	b_sum[0].s[j] += (int)b[j].s[i];
				540	})
				541	})
				542
				543	lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
				544	rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
				545	}
				546	#endif // ((K % K0) != 0)
				547
				548	LOOP_UNROLLING(int, i, 0, 1, M0,
				549	{
				550	LOOP_UNROLLING(int, j, 0, 1, N0,
				551	{
Mohammed Suhail Munshi	a2bb80e	2023-06-19 14:57:57 +0100	[diff] [blame^]	552	acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	553	})
				554	})
				555
				556	const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
				557	const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
				558
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	559	T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
				560
Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	561	// Quantize the tile
				562	TILE(DATA_TYPE, M0, N0, accq);
				563	T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
				564
				565	TILE(int, M0, 1, indirect_buffer);
				566	LOOP_UNROLLING(int, _i, 0, 1, M0,
				567	{
				568	indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
				569	});
				570
				571	T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
				572	}
				573	#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
Omar Al Khatib	467daef	2023-04-13 14:56:23 +0100	[diff] [blame]	574
				575	#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
				576	/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
				577	*
				578	* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
				579	* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
				580	* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
				581	* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
				582	* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	583	* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
Omar Al Khatib	467daef	2023-04-13 14:56:23 +0100	[diff] [blame]	584	* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
				585	* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
				586	* @note Only the following configurations of M0, N0 and K0 are currently supported:
				587	* - M0 = 1, 2, 3, 4, 8, 16
				588	* - N0 = 1, 2, 3, 4, 8, 16
				589	* - K0 = 1, 2, 3, 4, 8, 16
				590	* @note Values > 8 for M0, N0 and K0 are not expected to be efficient
				591	*
				592	* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
				593	* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
				594	* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
				595	* @param[in] lhs_w The width of the lhs tensor
				596	* @param[in] lhs_h The height of the lhs tensor
				597	* @param[in] lhs_n Number of the matrices (buffers) in the batch
				598	* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
				599	* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
				600	* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
				601	* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
				602	* @param[in] rhs_w The width of the rhs tensor
				603	* @param[in] rhs_h The height of the rhs tensor
				604	* @param[in] rhs_n Number of the matrices (buffers) in the batch
				605	* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
				606	* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
				607	* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
				608	* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
				609	* @param[in] dst_w The width of the dst tensor
				610	* @param[in] dst_h The height of the dst tensor
				611	* @param[in] dst_n Number of the matrices (buffers) in the batch
				612	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
				613	*/
				614	__kernel void mat_mul_native_quantized_t_t(
				615	TENSOR3D_T(lhs, BUFFER),
				616	TENSOR3D_T(rhs, BUFFER),
				617	TENSOR3D_T(dst, BUFFER))
				618	{
				619	const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
				620	const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
				621	const uint z = GET_SPATIAL_IDX(2, 1, 0);
				622
				623	// Compute LHS/RHS/DST matrix address
				624	lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
				625	rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
				626	dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
				627
				628	// Initialize the accumulators
				629	TILE(int, M0, N0, acc);
				630	LOOP_UNROLLING(int, i, 0, 1, M0,
				631	{
				632	acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
				633	})
				634
				635	TILE(int, 1, M0, a_sum);
				636	a_sum[0].v = 0;
				637
				638	TILE(int, 1, N0, b_sum);
				639	b_sum[0].v = 0;
				640
				641	int k;
				642	for(k = 0; k <= K - K0; k += K0)
				643	{
				644	TILE(DATA_TYPE, M0, K0, a);
				645	TILE(DATA_TYPE, N0, K0, b);
				646
				647	LOOP_UNROLLING(int, i, 0, 1, M0,
				648	{
				649	a[i].v = 0;
				650	})
				651
				652	LOOP_UNROLLING(int, i, 0, 1, N0,
				653	{
				654	b[i].v = 0;
				655	})
				656
				657	// Load tile from the lhs tensor in a transposed fashion
				658	// see mat_mul_native_quantized_nt_nt main loop for more explanation
				659	T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
				660
				661	// Load tile from the rhs tensor
				662	T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
				663
				664	T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
				665
				666	LOOP_UNROLLING(int, i, 0, 1, K0,
				667	{
				668	LOOP_UNROLLING(int, j, 0, 1, M0,
				669	{
				670	a_sum[0].s[j] += (int)a[j].s[i];
				671	})
				672	})
				673
				674	LOOP_UNROLLING(int, i, 0, 1, N0,
				675	{
				676	LOOP_UNROLLING(int, j, 0, 1, K0,
				677	{
				678	b_sum[0].s[i] += (int)b[i].s[j];
				679	})
				680	})
				681
				682	lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
				683	rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
				684	}
				685
				686	#if((K % K0) != 0)
				687	/* Leftover Loop */
				688	for(; k < K; ++k)
				689	{
				690	TILE(DATA_TYPE, M0, 1, a);
				691	TILE(DATA_TYPE, N0, 1, b);
				692
				693	LOOP_UNROLLING(int, i, 0, 1, M0,
				694	{
				695	a[i].v = 0;
				696	})
				697
				698	LOOP_UNROLLING(int, i, 0, 1, N0,
				699	{
				700	b[i].v = 0;
				701	})
				702
				703	// Load tile from the lhs tensor in a transposed fashion
				704	// see mat_mul_native_quantized_nt_nt main loop for more explanation
				705	T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
				706
				707	// Load tile from the rhs tensor
				708	T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
				709
				710	T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
				711
				712	LOOP_UNROLLING(int, i, 0, 1, 1,
				713	{
				714	LOOP_UNROLLING(int, j, 0, 1, M0,
				715	{
				716	a_sum[0].s[j] += (int)a[j].s[i];
				717	})
				718	})
				719
				720	LOOP_UNROLLING(int, i, 0, 1, N0,
				721	{
				722	LOOP_UNROLLING(int, j, 0, 1, 1,
				723	{
				724	b_sum[0].s[i] += (int)b[i].s[j];
				725	})
				726	})
				727
				728	lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
				729	rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
				730	}
				731	#endif // ((K % K0) != 0)
				732
				733	LOOP_UNROLLING(int, i, 0, 1, M0,
				734	{
				735	LOOP_UNROLLING(int, j, 0, 1, N0,
				736	{
Mohammed Suhail Munshi	a2bb80e	2023-06-19 14:57:57 +0100	[diff] [blame^]	737	acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
Omar Al Khatib	467daef	2023-04-13 14:56:23 +0100	[diff] [blame]	738	})
				739	})
				740
				741	const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
				742	const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
				743
				744	// Quantize the tile
Mohammed Suhail Munshi	94abde4	2023-05-25 16:48:43 +0100	[diff] [blame]	745	T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
				746
Omar Al Khatib	467daef	2023-04-13 14:56:23 +0100	[diff] [blame]	747	TILE(DATA_TYPE, M0, N0, accq);
				748	T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
				749
				750	TILE(int, M0, 1, indirect_buffer);
				751	LOOP_UNROLLING(int, _i, 0, 1, M0,
				752	{
				753	indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
				754	});
				755
				756	T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
				757	}
				758	#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)