Blame - src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp - ml/ComputeLibrary

blob: d5ecdf7dd27338ce3e5c7efc364968c44a9cdd04 [file] [log] [blame]

Gunes Bayir	9d0c4de	2023-04-13 18:22:58 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2023 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
				25
				26	#include "arm_compute/core/CL/CLHelpers.h"
				27	#include "arm_compute/core/CL/ICLTensor.h"
				28	#include "arm_compute/core/ITensorPack.h"
				29	#include "arm_compute/core/TensorInfo.h"
				30	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
				31	#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
				32
				33	#include "src/common/utils/Log.h"
				34	#include "src/core/helpers/AutoConfiguration.h"
				35	#include "src/core/helpers/WindowHelpers.h"
				36	#include "src/gpu/cl/ClCompileContext.h"
				37
				38	#include "support/Cast.h"
				39	#include "support/StringSupport.h"
				40
				41	namespace arm_compute
				42	{
				43	namespace opencl
				44	{
				45	namespace kernels
				46	{
				47	namespace
				48	{
				49	Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
				50	{
				51	const bool adj_lhs = matmul_kernel_info.adj_lhs;
				52	const bool adj_rhs = matmul_kernel_info.adj_rhs;
				53	const int m0 = matmul_kernel_info.m0;
				54	const int n0 = matmul_kernel_info.n0;
				55	const int k0 = matmul_kernel_info.k0;
				56
				57	// Validate M0
				58	ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
				59
				60	if(adj_lhs)
				61	{
				62	ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) \|\| (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
				63	}
				64
				65	// Validate N0
				66	ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
				67	ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) \|\| (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0");
				68
				69	// Validate K0
				70	ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
				71	if(!adj_lhs \|\| adj_rhs)
				72	{
				73	ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) \|\| (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0");
				74	}
				75
				76	return Status{};
				77	}
				78
				79	Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info)
				80	{
				81	const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
				82	const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y();
				83
				84	ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match.");
				85	ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty");
				86	ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty");
				87
				88	constexpr size_t batch_dim_start = 2;
				89	for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i)
				90	{
				91	ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported");
				92	}
				93
				94	return Status{};
				95	}
				96	}
				97	ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
				98	{
				99	_type = CLKernelType::GEMM;
				100	}
				101	Status ClMatMulLowpNativeKernel::validate(const ITensorInfo lhs, const ITensorInfo rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
				102	{
				103	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output);
				104	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
				105	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
				106	ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
				107	ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
				108
				109	if(output->total_size() != 0)
				110	{
				111	const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
				112	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
				113	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output);
				114	}
				115
				116	return Status{};
				117	}
				118	void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo lhs, ITensorInfo rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
				119	{
				120	ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info);
				121	ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info);
				122	ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info));
				123
				124	// output tensor auto initialization if not yet initialized
				125	auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
				126
				127	const int m = output->dimension(1);
				128	const int n = output->dimension(0);
				129	const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
				130	const bool adj_lhs = matmul_kernel_info.adj_lhs;
				131
				132	int m0 = adj_lhs ? adjust_vec_size(matmul_kernel_info.m0, m) : std::min(matmul_kernel_info.m0, m);
				133	int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
				134
				135	// Configure kernel window
				136	Window win = calculate_max_window(*output, Steps(n0, m0));
				137	win = win.collapse(win, Window::DimZ);
				138	IClKernel::configure_internal(win);
				139
				140	// Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
				141	const unsigned int partial_store_m0 = m % m0;
				142	const unsigned int partial_store_n0 = n % n0;
				143
				144	CLBuildOptions build_opts;
				145	build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
				146	build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
				147	build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
				148	build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
				149	build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
				150	build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
				151	build_opts.add_option("-DK=" + support::cpp11::to_string(k));
				152
				153	const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
				154	const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
				155	const UniformQuantizationInfo dqinfo = output->quantization_info().uniform();
				156
				157	float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale;
				158	int output_multiplier = 0;
				159	int output_shift = 0;
				160	arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
				161
				162	build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
				163	build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
				164
				165	build_opts.add_option("-DLHS_OFFSET=" + support::cpp11::to_string(-lqinfo.offset)); // Note this is passed as negative to maintain similarity with CLDirectConv2D
				166	build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(-rqinfo.offset)); // Note this is passed as negative to maintain similarity with CLDirectConv2D
				167	build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset)); // Passed as positive (unlike the above two)
				168
				169	std::string kernel_name("mat_mul_native_quantized");
				170	kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
				171	kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
				172
				173	// A macro guard to compile ONLY the kernel of interest
				174	build_opts.add_option("-D" + upper_string(kernel_name));
				175
				176	// Create kernel
				177	_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
				178
				179	// Set config_id for enabling LWS tuning
				180	const size_t number_of_batches = output->tensor_shape().total_size() / (m * n);
				181
				182	_config_id = kernel_name;
				183	_config_id += "_";
				184	_config_id += lower_string(string_from_data_type(lhs->data_type()));
				185	_config_id += "_";
				186	_config_id += support::cpp11::to_string(m);
				187	_config_id += "_";
				188	_config_id += support::cpp11::to_string(n);
				189	_config_id += "_";
				190	_config_id += support::cpp11::to_string(k);
				191	_config_id += "_";
				192	_config_id += support::cpp11::to_string(number_of_batches);
				193	_config_id += "_";
				194	_config_id += support::cpp11::to_string(m0);
				195	_config_id += "_";
				196	_config_id += support::cpp11::to_string(n0);
				197	_config_id += "_";
				198	_config_id += support::cpp11::to_string(matmul_kernel_info.k0);
				199	}
				200
				201	void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
				202	{
				203	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				204	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
				205
				206	const ICLTensor lhs = utils::cast::polymorphic_downcast<const ICLTensor >(tensors.get_const_tensor(TensorType::ACL_SRC_0));
				207	const ICLTensor rhs = utils::cast::polymorphic_downcast<const ICLTensor >(tensors.get_const_tensor(TensorType::ACL_SRC_1));
				208	ICLTensor output = utils::cast::polymorphic_downcast<ICLTensor >(tensors.get_tensor(TensorType::ACL_DST));
				209	ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
				210	ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output);
				211
				212	unsigned int idx = 0;
				213	Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
				214
				215	add_3d_tensor_nhw_argument(idx, lhs);
				216	add_3d_tensor_nhw_argument(idx, rhs);
				217	add_3d_tensor_nhw_argument(idx, output);
				218
				219	enqueue(queue, *this, window_collapsed, lws_hint());
				220	}
				221
				222	} // namespace kernels
				223	} // namespace opencl
				224	} // namespace arm_compute