/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
 *
 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
 * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators with the constant term of the zero-point
    // expansion: K * lhs_offset * rhs_offset. The terms depending on the
    // row/column sums are subtracted after the K loop.
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)(LHS_OFFSET)) * ((int)(RHS_OFFSET));
    })

    // Column sums of the rhs tile (b) and row sums of the lhs tile (a),
    // accumulated across the whole K dimension for the offset correction.
    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion
        // in order to use T_MMUL_NT_T macro because only this macro
        // can utilize dot product instruction for Int8/UInt8 by
        // directly multiplying the rows of Lhs and Rhs tensors.
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion.
        // See the main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

    // Subtract the cross terms of the zero-point expansion:
    // rhs_offset * sum_k(lhs) and lhs_offset * sum_k(rhs)
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
 *
 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
 * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0, K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    // RHS is transposed: the x (N) coordinate advances along rows, hence rhs_stride_y
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators with the constant term of the zero-point
    // expansion: K * lhs_offset * rhs_offset
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)(LHS_OFFSET)) * ((int)(RHS_OFFSET));
    })

    // Row sums of lhs and rhs tiles over K, needed for the offset correction
    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

    // Subtract the cross terms of the zero-point expansion
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
 *
 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
 * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    // LHS is transposed: the y (M) coordinate advances along columns, hence sizeof(DATA_TYPE)
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators with the constant term of the zero-point
    // expansion: K * lhs_offset * rhs_offset
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)(LHS_OFFSET)) * ((int)(RHS_OFFSET));
    })

    // Sums of lhs rows and rhs columns over K, needed for the offset correction
    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

    // Subtract the cross terms of the zero-point expansion
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
 *
 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
 * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    // Both operands are transposed: y advances along lhs columns, x along rhs rows
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators with the constant term of the zero-point
    // expansion: K * lhs_offset * rhs_offset
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)(LHS_OFFSET)) * ((int)(RHS_OFFSET));
    })

    // Sums of lhs rows and rhs columns over K, needed for the offset correction
    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

    // Subtract the cross terms of the zero-point expansion
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)