Blame - src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp - ml/ComputeLibrary

blob: 9215fd602de14226da4480c99b1c9da513f828b2 [file] [log] [blame]

Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	1	/*
Giorgio Arena	9f7d55a	2021-02-08 13:20:24 +0000	[diff] [blame]	2	* Copyright (c) 2019-2021 Arm Limited.
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Sang-Hoon Park	bef7fa2	2020-10-21 15:58:54 +0100	[diff] [blame]	24	#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	25
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	26	#include "arm_compute/core/CL/CLHelpers.h"
				27	#include "arm_compute/core/CL/CLKernelLibrary.h"
				28	#include "arm_compute/core/CL/ICLTensor.h"
				29	#include "arm_compute/core/CL/OpenCL.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	30	#include "arm_compute/core/Helpers.h"
				31	#include "arm_compute/core/TensorInfo.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	32	#include "arm_compute/core/Utils.h"
				33	#include "arm_compute/core/Validate.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	34	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	35	#include "src/core/AccessWindowStatic.h"
				36	#include "src/core/helpers/AutoConfiguration.h"
				37	#include "src/core/helpers/WindowHelpers.h"
Matthew Bentham	758b5ba	2020-03-05 23:37:48 +0000	[diff] [blame]	38	#include "support/StringSupport.h"
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	39
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	40	namespace arm_compute
				41	{
				42	using namespace misc::shape_calculator;
				43
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	44	namespace
				45	{
				46	using ElementsProcessed = Steps;
				47
				48	Status validate_arguments(const ITensorInfo input0, const ITensorInfo input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
				49	const GEMMReshapeInfo &gemm_info)
				50	{
				51	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
Manuel Bottini	959c26d	2019-12-02 16:22:35 +0000	[diff] [blame]	52	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
SiCong Li	a208a80	2020-05-12 15:46:29 +0100	[diff] [blame]	53	if(input0->data_type() == DataType::QASYMM8)
				54	{
				55	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
				56	}
				57	else
				58	{
Sheri Zhang	42550c0	2020-07-06 13:48:11 +0100	[diff] [blame]	59	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
SiCong Li	a208a80	2020-05-12 15:46:29 +0100	[diff] [blame]	60	}
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	61	ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
				62	ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
				63	ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
				64	ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
				65	ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
Gian Marco Iodice	06be6f8	2019-06-24 17:47:51 +0100	[diff] [blame]	66	ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 \|\| lhs_info.m0 > 8);
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	67	ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
Gian Marco Iodice	dd717c3	2020-05-28 10:22:03 +0100	[diff] [blame]	68	ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	69
				70	const int m = gemm_info.m();
				71	const int n = gemm_info.n();
				72	const int k = gemm_info.k();
				73
				74	ARM_COMPUTE_UNUSED(m);
				75	ARM_COMPUTE_UNUSED(n);
				76	ARM_COMPUTE_UNUSED(k);
				77
				78	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
				79	ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
				80	ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
				81	if(gemm_info.reinterpret_input_as_3d())
				82	{
				83	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
				84	}
				85	else
				86	{
				87	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
				88	}
				89
				90	if(output->total_size() != 0)
				91	{
				92	const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(input0, input1, gemm_info));
				93	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
				94	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
				95	}
				96
				97	return Status{};
				98	}
				99
				100	std::pair<Status, Window> validate_and_configure_window(ITensorInfo input0, ITensorInfo input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
				101	const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
				102	{
				103	unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
				104	unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
				105	bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
				106	bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
				107
				108	Window win{};
Michele Di Giorgio	27d92fd	2020-10-27 12:44:17 +0000	[diff] [blame]	109	bool window_changed = false;
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	110
				111	// In case both input and output have to be reinterpreted as 3D tensors,
				112	// force reinterpret_output_as_3d to be false.
				113	if(reinterpret_input_as_3d == reinterpret_output_as_3d)
				114	{
				115	reinterpret_output_as_3d = false;
				116	}
				117
				118	// Output tensor auto initialization if not yet initialized
				119	auto_init_if_empty(output, input0->clone()->set_tensor_shape(compute_mm_shape(input0, *input1, gemm_info)).set_data_type(DataType::S32));
				120
				121	TensorInfo tmp_info(*output);
				122
				123	if(reinterpret_output_as_3d)
				124	{
				125	// Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
				126	// the window needs to be constructed on the 2D collapsed version of the tensor
				127	TensorShape tmp_shape(output->tensor_shape());
				128	tmp_shape.collapse(2U, 1U);
				129	tmp_info.set_tensor_shape(tmp_shape);
				130	}
				131
				132	// Configure kernel window
				133	num_elems_processed_per_iteration_x = rhs_info.n0;
				134	num_elems_processed_per_iteration_y = lhs_info.m0;
				135
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	136	win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
Michele Di Giorgio	27d92fd	2020-10-27 12:44:17 +0000	[diff] [blame]	137
				138	// RHS matrix still needs padding on the X
				139	AccessWindowStatic input1_access(input1, 0, 0,
				140	ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
				141	input1->dimension(1));
				142
				143	window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	144
				145	// Collapse along the Z direction
				146	// This collapse needs to be here in order to tune the Z dimension of LWS
				147	Window collapsed = win;
				148	const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
				149	collapsed = win.collapse(win, dimension_to_collapse);
				150
Michele Di Giorgio	27d92fd	2020-10-27 12:44:17 +0000	[diff] [blame]	151	Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
				152	return std::make_pair(err, collapsed);
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	153	}
				154	} // namespace
				155
				156	CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel()
				157	: _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
				158	{
				159	}
				160
				161	void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor input0, const ICLTensor input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
				162	const GEMMReshapeInfo &gemm_info)
				163	{
Manuel Bottini	4c6bd51	2020-04-08 10:15:51 +0100	[diff] [blame]	164	configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
				165	}
				166
Manuel Bottini	679fc96	2020-04-21 16:08:53 +0100	[diff] [blame]	167	void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor input0, const ICLTensor input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
Manuel Bottini	4c6bd51	2020-04-08 10:15:51 +0100	[diff] [blame]	168	const GEMMRHSMatrixInfo &rhs_info,
				169	const GEMMReshapeInfo &gemm_info)
				170	{
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	171	ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
				172
				173	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
				174
				175	_input0 = input0;
				176	_input1 = input1;
				177	_output = output;
				178	_reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
				179	_reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
				180	_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
				181
Michele Di Giorgio	27d92fd	2020-10-27 12:44:17 +0000	[diff] [blame]	182	// We still need padding on the X dimension for the RHS matrix
				183	auto padding_info = get_padding_info({ input0, output });
				184
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	185	// In case both input and output have to be reinterpreted as 3D tensors,
				186	// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
				187	if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
				188	{
				189	_reinterpret_input_as_3d = false;
				190	_reinterpret_output_as_3d = false;
				191	}
				192
				193	// Check if we need to slide the matrix B
				194	const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
				195	_slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
				196
				197	ElementsProcessed num_elements_processed{};
				198
				199	// Configure kernel window
				200	auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
				201	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
				202	ICLKernel::configure_internal(win_config.second);
				203
morgolock	cf343e3	2020-10-12 14:00:43 +0100	[diff] [blame]	204	// If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
				205	// we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
				206	// This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
Michele Di Giorgio	27d92fd	2020-10-27 12:44:17 +0000	[diff] [blame]	207	const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
morgolock	cf343e3	2020-10-12 14:00:43 +0100	[diff] [blame]	208	// Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
				209	const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
				210	const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
				211
Michele Di Giorgio	27d92fd	2020-10-27 12:44:17 +0000	[diff] [blame]	212	// Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
				213	// NOTE: This might have implications on heuristics and performance
				214	const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
				215
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	216	// Create build options
				217	CLBuildOptions build_opts;
				218	build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
				219	build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
				220	build_opts.add_option_if(_reinterpret_input_as_3d \|\| _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
				221	build_opts.add_option_if(_reinterpret_input_as_3d \|\| _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
				222	build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
				223	build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
				224	build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
				225	build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
				226	build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
Michele Di Giorgio	27d92fd	2020-10-27 12:44:17 +0000	[diff] [blame]	227	build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	228	build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
				229	build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
Michele Di Giorgio	f9179d3	2019-11-27 16:17:30 +0000	[diff] [blame]	230	build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
				231	build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
morgolock	cf343e3	2020-10-12 14:00:43 +0100	[diff] [blame]	232	build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
				233	build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	234	std::string kernel_name("gemmlowp_mm_native");
				235
				236	// Create kernel
Manuel Bottini	4c6bd51	2020-04-08 10:15:51 +0100	[diff] [blame]	237	_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	238
				239	// Set config_id for enabling LWS tuning
				240	_config_id = kernel_name;
				241	_config_id += "_";
				242	_config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
				243	_config_id += "_";
				244	_config_id += (_reinterpret_input_as_3d ? "3di_" : "");
				245	_config_id += (_reinterpret_output_as_3d ? "3do_" : "");
				246	_config_id += support::cpp11::to_string(output->info()->dimension(1));
				247	_config_id += "_";
				248	_config_id += support::cpp11::to_string(output->info()->dimension(0));
				249	_config_id += "_";
				250	_config_id += support::cpp11::to_string(gemm_info.k());
				251	_config_id += "_";
				252	_config_id += support::cpp11::to_string(output->info()->dimension(2));
				253	_config_id += "_";
				254	_config_id += support::cpp11::to_string(lhs_info.m0);
				255	_config_id += "_";
				256	_config_id += support::cpp11::to_string(rhs_info.n0);
				257	_config_id += "_";
				258	_config_id += support::cpp11::to_string(lhs_info.k0);
Michele Di Giorgio	27d92fd	2020-10-27 12:44:17 +0000	[diff] [blame]	259
				260	ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	261	}
				262
				263	Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo input0, const ITensorInfo input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
				264	const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
				265	{
				266	ElementsProcessed num_elements_processed{};
				267	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
				268	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
				269	input1->clone().get(),
				270	output->clone().get(),
				271	lhs_info,
				272	rhs_info,
				273	gemm_info,
				274	num_elements_processed)
				275	.first);
				276
				277	return Status{};
				278	}
				279
				280	void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
				281	{
				282	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				283	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
				284
				285	if(_input1->info()->num_dimensions() < 3)
				286	{
Giorgio Arena	edc524e	2021-02-10 11:54:47 +0000	[diff] [blame^]	287	// The stride_z for matrix B must be zero if we do not slice
				288	ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
Gian Marco Iodice	e751062	2019-06-03 17:28:17 +0100	[diff] [blame]	289	}
				290
				291	Window slice = window.first_slice_window_3D();
				292	Window slice_matrix_b = slice;
				293
				294	slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
				295	slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
				296
				297	if(_reinterpret_input_as_3d)
				298	{
				299	// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
				300	const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
				301	const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
				302	_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
				303	}
				304
				305	if(_reinterpret_output_as_3d)
				306	{
				307	// Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
				308	const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
				309	const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
				310	_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
				311	}
				312
				313	do
				314	{
				315	Window slice_b = slice;
				316	// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
				317	// This scenario can happen when the matrix multiplication is used to perform a convolution operation
				318	if(!_slide_matrix_b)
				319	{
				320	slice_b = slice_matrix_b;
				321	}
				322
				323	unsigned int idx = 0;
				324	add_2D_tensor_argument(idx, _input0, slice);
				325	add_2D_tensor_argument(idx, _input1, slice_b);
				326	add_2D_tensor_argument(idx, _output, slice);
				327	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
				328	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
				329	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
				330	enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
				331	}
				332	while(window.slide_window_slice_3D(slice));
				333	}
Matthew Bentham	758b5ba	2020-03-05 23:37:48 +0000	[diff] [blame]	334	} // namespace arm_compute