Blame - src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp - ml/ComputeLibrary

blob: 3143075a9d7aafcb797e5ff692a60bf2a0026e62 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	2	* Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
				25
				26	#include "arm_compute/core/AccessWindowStatic.h"
				27	#include "arm_compute/core/AccessWindowTranspose.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	28	#include "arm_compute/core/CL/CLHelpers.h"
				29	#include "arm_compute/core/CL/CLKernelLibrary.h"
				30	#include "arm_compute/core/CL/ICLTensor.h"
				31	#include "arm_compute/core/CL/OpenCL.h"
				32	#include "arm_compute/core/Error.h"
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	33	#include "arm_compute/core/FixedPoint.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	34	#include "arm_compute/core/Helpers.h"
Isabella Gottardi	d56e770	2018-02-28 14:29:36 +0000	[diff] [blame^]	35	#include "arm_compute/core/TensorInfo.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	36	#include "arm_compute/core/Types.h"
				37	#include "arm_compute/core/Utils.h"
				38	#include "arm_compute/core/Validate.h"
				39	#include "arm_compute/core/Window.h"
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	40	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	41
				42	#include <set>
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	43	#include <string>
				44
				45	using namespace arm_compute;
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	46	using namespace arm_compute::misc::shape_calculator;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	47
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	48	namespace
				49	{
				50	using ElementsProcessed = Steps;
				51
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	52	inline Status validate_arguments(const ITensorInfo input0, const ITensorInfo input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	53	{
Georgios Pinitas	78c0090	2018-01-09 17:33:11 +0000	[diff] [blame]	54	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	55	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	56	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
				57	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
				58
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	59	if(!is_interleaved_transposed)
				60	{
				61	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	62
				63	if(output->total_size() != 0)
				64	{
				65	ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
				66	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
				67	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
				68	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
				69	}
				70	}
				71	else
				72	{
				73	const int m = reshape_info.m();
				74	const int n = reshape_info.n();
				75	const int k = reshape_info.k();
				76	const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
				77	const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
				78
				79	TensorShape tensor_shape0{ input0->tensor_shape() };
				80	tensor_shape0.set(0, k);
				81	tensor_shape0.set(1, m);
				82
				83	TensorShape tensor_shape1{ input1->tensor_shape() };
				84	tensor_shape1.set(0, n);
				85	tensor_shape1.set(1, k);
				86
				87	const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
				88	const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
				89
				90	const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
				91	const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
				92
				93	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
				94	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
				95
				96	if(output->total_size() != 0)
				97	{
				98	ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
				99	ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
				100	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
				101	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
				102	}
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	103	}
				104
				105	return Status{};
				106	}
				107
				108	inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo input0, ITensorInfo input1, ITensorInfo *output,
				109	bool is_interleaved_transposed, GPUTarget gpu_target,
				110	ElementsProcessed &num_elements_processed)
				111	{
				112	bool window_changed = false;
				113	Window win{};
				114
				115	const DataType data_type = input0->data_type();
				116	unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
				117	unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
				118
				119	if(is_interleaved_transposed)
				120	{
				121	// Configure kernel window
				122	num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
				123	num_elems_processed_per_iteration_y = 4;
				124
				125	win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
				126
				127	AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
				128	AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
				129	AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
				130
				131	window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
				132
				133	output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
				134	}
				135	else // The input tensors have not been reshaped
				136	{
				137	// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
				138	num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
				139	num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
				140
				141	// Create kernels according to the architecture, data type and input size.
				142	if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
				143	{
Gian Marco	1d25ed5	2017-12-16 19:33:50 +0000	[diff] [blame]	144	num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	145	}
				146
				147	// Configure window
				148	win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
				149
				150	AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
				151	AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
				152	AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
				153
				154	window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
				155
				156	Coordinates coord;
				157	coord.set_num_dimensions(output->num_dimensions());
				158	output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
				159	}
				160
				161	Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
				162	return std::make_pair(err, win);
				163	}
				164	} // namespace
				165
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	166	CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
				167	: _input0(nullptr), _input1(nullptr), _output(nullptr)
				168	{
				169	}
				170
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	171	void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor input0, const ICLTensor input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	172	{
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	173	ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
				174
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	175	// Output tensor auto inizialitation if not yet initialized
				176	TensorShape tensor_shape{ input0->info()->tensor_shape() };
				177	tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
				178	tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
				179
				180	auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
				181
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	182	// Perform validate step
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	183	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	184
				185	_input0 = input0;
				186	_input1 = input1;
				187	_output = output;
				188
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	189	const DataType data_type = input0->info()->data_type();
				190	const int fp_pos = input0->info()->fixed_point_position();
				191
				192	// Get target architecture
				193	GPUTarget arch_target = get_arch_from_target(get_target());
				194
				195	// Configure LWS hint
Anthony Barbier	fcd52fb	2017-11-28 10:31:43 +0000	[diff] [blame]	196	if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24)
				197	{
				198	// LWS optimized for the 11x11 AlexNet convolution on Bifrost.
				199	_lws_hint = cl::NDRange(2, 2);
				200	}
				201	else if(output->info()->dimension(1) == 196)
				202	{
				203	_lws_hint = cl::NDRange(1, 7);
				204	}
				205	else
				206	{
				207	_lws_hint = cl::NDRange(8, 8);
				208	}
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	209
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	210	ElementsProcessed num_elements_processed{};
				211
				212	// Configure kernel window
				213	auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
				214	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
				215	ICLKernel::configure(win_config.second);
				216
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	217	// Create build options
				218	CLBuildOptions build_opts;
				219	build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos));
				220
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	221	// Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	222	if(std::abs(1.0f - alpha) > 0.00001f)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	223	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	224	build_opts.add_option_if_else(is_data_type_fixed_point(data_type),
				225	"-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))),
				226	"-DALPHA=" + float_to_string_with_full_precision(alpha));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	227	}
				228
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	229	std::string kernel_name;
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	230	if(is_interleaved_transposed)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	231	{
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	232	const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
				233	const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
				234
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	235	build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	236	build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
				237	build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
				238
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	239	if(data_type == DataType::F32)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	240	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	241	kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	242	}
				243	else
				244	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	245	kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	246	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	247	}
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	248	else // The input tensors have not been reshaped
				249	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	250	build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	251
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	252	// Create kernels according to the architecture, data type and input size.
				253	if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	254	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	255	// The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
				256	// FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
				257	// FC6 and FC7 of AlexNet and VGG-16).
Gian Marco	1d25ed5	2017-12-16 19:33:50 +0000	[diff] [blame]	258	kernel_name = (input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	259
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	260	// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
				261	// via exhaustive autotuning over a range of representative layer configurations.
				262	_lws_hint = cl::NDRange(4);
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	263	}
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	264	else if(is_data_type_fixed_point(data_type))
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	265	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	266	kernel_name = "gemm_mm_" + lower_string(string_from_data_type(data_type));
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	267	}
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	268	else // (MIDGARD and F32) or (F16)
				269	{
				270	build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
				271	kernel_name = "gemm_mm_floating_point";
				272	}
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	273	build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
				274	build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	275	}
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	276
				277	// Create kernel
				278	_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
				279
				280	// Set config_id for enabling LWS tuning
				281	_config_id = "gemm_";
				282	_config_id += (is_interleaved_transposed ? "reshaped_" : "");
				283	_config_id += lower_string(string_from_data_type(input0->info()->data_type()));
				284	_config_id += "_";
				285	_config_id += support::cpp11::to_string(output->info()->dimension(1));
				286	_config_id += "_";
				287	_config_id += support::cpp11::to_string(output->info()->dimension(0));
				288	_config_id += "_";
				289	_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	290	}
				291
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	292	Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo input0, const ITensorInfo input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
				293	const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	294	{
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	295	// Note: num_elements_processed will be set in validate_and_configure_window()
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	296	ElementsProcessed num_elements_processed{};
				297	ARM_COMPUTE_UNUSED(alpha);
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	298	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	299	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
				300	input1->clone().get(),
				301	output->clone().get(),
				302	is_interleaved_transposed,
				303	gpu_target,
				304	num_elements_processed)
				305	.first);
				306
				307	return Status{};
				308	}
				309
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	310	void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
				311	{
				312	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				313	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
				314
				315	Window slice = window.first_slice_window_2D();
				316	Window slice_matrix_b = slice;
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	317
				318	slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
				319	slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	320
				321	do
				322	{
				323	Window slice_b = slice;
				324	// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
				325	// This scenario can happen when the the matrix multiplication is used to perform a convolution operation
				326	if(_input1->info()->num_dimensions() < 3)
				327	{
				328	slice_b = slice_matrix_b;
				329	}
				330
				331	unsigned int idx = 0;
				332	add_2D_tensor_argument(idx, _input0, slice);
				333	add_2D_tensor_argument(idx, _input1, slice_b);
				334	add_2D_tensor_argument(idx, _output, slice);
				335	enqueue(queue, *this, slice, _lws_hint);
				336	}
				337	while(window.slide_window_slice_2D(slice));
				338	}