Blame - src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp - ml/ComputeLibrary

blob: cc98845e0f4690cf7e7a477189bc13fdc2ad0c01 [file] [log] [blame]

Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	1	/*
Michele Di Giorgio	d9eaf61	2020-07-08 11:12:57 +0100	[diff] [blame]	2	* Copyright (c) 2019-2020 Arm Limited.
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
				25
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	26	#include "arm_compute/core/CL/CLHelpers.h"
				27	#include "arm_compute/core/CL/CLKernelLibrary.h"
				28	#include "arm_compute/core/CL/ICLTensor.h"
				29	#include "arm_compute/core/CL/OpenCL.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	30	#include "arm_compute/core/Helpers.h"
				31	#include "arm_compute/core/TensorInfo.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	32	#include "arm_compute/core/Utils.h"
				33	#include "arm_compute/core/Validate.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	34	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	35	#include "src/core/AccessWindowStatic.h"
				36	#include "src/core/helpers/AutoConfiguration.h"
				37	#include "src/core/helpers/WindowHelpers.h"
Matthew Bentham	758b5ba	2020-03-05 23:37:48 +0000	[diff] [blame]	38	#include "support/StringSupport.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	39
				40	#include <cstddef>
				41	#include <cstdint>
				42	#include <tuple>
				43
				44	namespace arm_compute
				45	{
				46	using namespace misc::shape_calculator;
				47
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	48	namespace
				49	{
				50	using ElementsProcessed = Steps;
				51
				52	Status validate_arguments(const ITensorInfo input0, const ITensorInfo input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
				53	const GEMMReshapeInfo &gemm_info)
				54	{
				55	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
Manuel Bottini	959c26d	2019-12-02 16:22:35 +0000	[diff] [blame]	56	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
SiCong Li	a208a80	2020-05-12 15:46:29 +0100	[diff] [blame]	57	if(input0->data_type() == DataType::QASYMM8)
				58	{
				59	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
				60	}
				61	else
				62	{
Sheri Zhang	42550c0	2020-07-06 13:48:11 +0100	[diff] [blame]	63	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
SiCong Li	a208a80	2020-05-12 15:46:29 +0100	[diff] [blame]	64	}
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	65	ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
				66	ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
				67	ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
				68	ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
				69	ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
Gian Marco Iodice	06be6f8	2019-06-24 17:47:51 +0100	[diff] [blame]	70	ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 \|\| lhs_info.m0 > 8);
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	71	ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
Gian Marco Iodice	dd717c3	2020-05-28 10:22:03 +0100	[diff] [blame]	72	ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	73
				74	const int m = gemm_info.m();
				75	const int n = gemm_info.n();
				76	const int k = gemm_info.k();
				77
				78	ARM_COMPUTE_UNUSED(m);
				79	ARM_COMPUTE_UNUSED(n);
				80	ARM_COMPUTE_UNUSED(k);
				81
				82	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
				83	ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
				84	ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
				85	if(gemm_info.reinterpret_input_as_3d())
				86	{
				87	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
				88	}
				89	else
				90	{
				91	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
				92	}
				93
				94	if(output->total_size() != 0)
				95	{
				96	const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(input0, input1, gemm_info));
				97	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
				98	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
				99	}
				100
				101	return Status{};
				102	}
				103
				104	std::pair<Status, Window> validate_and_configure_window(ITensorInfo input0, ITensorInfo input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
				105	const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
				106	{
				107	unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
				108	unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
				109	bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
				110	bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
				111
				112	Window win{};
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	113
				114	// In case both input and output have to be reinterpreted as 3D tensors,
				115	// force reinterpret_output_as_3d to be false.
				116	if(reinterpret_input_as_3d == reinterpret_output_as_3d)
				117	{
				118	reinterpret_output_as_3d = false;
				119	}
				120
				121	// Output tensor auto initialization if not yet initialized
				122	auto_init_if_empty(output, input0->clone()->set_tensor_shape(compute_mm_shape(input0, *input1, gemm_info)).set_data_type(DataType::S32));
				123
				124	TensorInfo tmp_info(*output);
				125
				126	if(reinterpret_output_as_3d)
				127	{
				128	// Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
				129	// the window needs to be constructed on the 2D collapsed version of the tensor
				130	TensorShape tmp_shape(output->tensor_shape());
				131	tmp_shape.collapse(2U, 1U);
				132	tmp_info.set_tensor_shape(tmp_shape);
				133	}
				134
				135	// Configure kernel window
				136	num_elems_processed_per_iteration_x = rhs_info.n0;
				137	num_elems_processed_per_iteration_y = lhs_info.m0;
				138
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	139	win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
morgolock	cf343e3	2020-10-12 14:00:43 +0100	[diff] [blame]	140	output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	141
				142	// Collapse along the Z direction
				143	// This collapse needs to be here in order to tune the Z dimension of LWS
				144	Window collapsed = win;
				145	const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
				146	collapsed = win.collapse(win, dimension_to_collapse);
				147
morgolock	cf343e3	2020-10-12 14:00:43 +0100	[diff] [blame]	148	return std::make_pair(Status(), collapsed);
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	149	}
				150	} // namespace
				151
				152	CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel()
				153	: _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
				154	{
				155	}
				156
				157	void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor input0, const ICLTensor input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
				158	const GEMMReshapeInfo &gemm_info)
				159	{
Manuel Bottini	4c6bd51	2020-04-08 10:15:51 +0100	[diff] [blame]	160	configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
				161	}
				162
Manuel Bottini	679fc96	2020-04-21 16:08:53 +0100	[diff] [blame]	163	void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor input0, const ICLTensor input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
Manuel Bottini	4c6bd51	2020-04-08 10:15:51 +0100	[diff] [blame]	164	const GEMMRHSMatrixInfo &rhs_info,
				165	const GEMMReshapeInfo &gemm_info)
				166	{
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	167	ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
				168
				169	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
				170
				171	_input0 = input0;
				172	_input1 = input1;
				173	_output = output;
				174	_reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
				175	_reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
				176	_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
				177
				178	// In case both input and output have to be reinterpreted as 3D tensors,
				179	// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
				180	if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
				181	{
				182	_reinterpret_input_as_3d = false;
				183	_reinterpret_output_as_3d = false;
				184	}
				185
				186	// Check if we need to slide the matrix B
				187	const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
				188	_slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
				189
				190	ElementsProcessed num_elements_processed{};
				191
				192	// Configure kernel window
				193	auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
				194	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
				195	ICLKernel::configure_internal(win_config.second);
				196
morgolock	cf343e3	2020-10-12 14:00:43 +0100	[diff] [blame]	197	// If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
				198	// we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
				199	// This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
				200	const unsigned int internal_m = input0->info()->dimension(1);
				201	// Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
				202	const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
				203	const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
				204
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	205	// Create build options
				206	CLBuildOptions build_opts;
				207	build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
				208	build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
				209	build_opts.add_option_if(_reinterpret_input_as_3d \|\| _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
				210	build_opts.add_option_if(_reinterpret_input_as_3d \|\| _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
				211	build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
				212	build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
				213	build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
				214	build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
				215	build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
				216	build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
				217	build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
				218	build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
Michele Di Giorgio	f9179d3	2019-11-27 16:17:30 +0000	[diff] [blame]	219	build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
				220	build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
morgolock	cf343e3	2020-10-12 14:00:43 +0100	[diff] [blame]	221	build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
				222	build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	223	std::string kernel_name("gemmlowp_mm_native");
				224
				225	// Create kernel
Manuel Bottini	4c6bd51	2020-04-08 10:15:51 +0100	[diff] [blame]	226	_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	227
				228	// Set config_id for enabling LWS tuning
				229	_config_id = kernel_name;
				230	_config_id += "_";
				231	_config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
				232	_config_id += "_";
				233	_config_id += (_reinterpret_input_as_3d ? "3di_" : "");
				234	_config_id += (_reinterpret_output_as_3d ? "3do_" : "");
				235	_config_id += support::cpp11::to_string(output->info()->dimension(1));
				236	_config_id += "_";
				237	_config_id += support::cpp11::to_string(output->info()->dimension(0));
				238	_config_id += "_";
				239	_config_id += support::cpp11::to_string(gemm_info.k());
				240	_config_id += "_";
				241	_config_id += support::cpp11::to_string(output->info()->dimension(2));
				242	_config_id += "_";
				243	_config_id += support::cpp11::to_string(lhs_info.m0);
				244	_config_id += "_";
				245	_config_id += support::cpp11::to_string(rhs_info.n0);
				246	_config_id += "_";
				247	_config_id += support::cpp11::to_string(lhs_info.k0);
				248	}
				249
				250	Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo input0, const ITensorInfo input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
				251	const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
				252	{
				253	ElementsProcessed num_elements_processed{};
				254	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
				255	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
				256	input1->clone().get(),
				257	output->clone().get(),
				258	lhs_info,
				259	rhs_info,
				260	gemm_info,
				261	num_elements_processed)
				262	.first);
				263
				264	return Status{};
				265	}
				266
				267	void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
				268	{
				269	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				270	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
				271
				272	if(_input1->info()->num_dimensions() < 3)
				273	{
				274	// The stride_z for matrix B must be zero if we do not slice
				275	ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
				276	}
				277
				278	Window slice = window.first_slice_window_3D();
				279	Window slice_matrix_b = slice;
				280
				281	slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
				282	slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
				283
				284	if(_reinterpret_input_as_3d)
				285	{
				286	// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
				287	const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
				288	const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
				289	_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
				290	}
				291
				292	if(_reinterpret_output_as_3d)
				293	{
				294	// Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
				295	const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
				296	const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
				297	_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
				298	}
				299
				300	do
				301	{
				302	Window slice_b = slice;
				303	// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
				304	// This scenario can happen when the matrix multiplication is used to perform a convolution operation
				305	if(!_slide_matrix_b)
				306	{
				307	slice_b = slice_matrix_b;
				308	}
				309
				310	unsigned int idx = 0;
				311	add_2D_tensor_argument(idx, _input0, slice);
				312	add_2D_tensor_argument(idx, _input1, slice_b);
				313	add_2D_tensor_argument(idx, _output, slice);
				314	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
				315	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
				316	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
				317	enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
				318	}
				319	while(window.slide_window_slice_3D(slice));
				320	}
Matthew Bentham	758b5ba	2020-03-05 23:37:48 +0000	[diff] [blame]	321	} // namespace arm_compute