Blame - src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp - ml/ComputeLibrary

blob: c9ed7763da3e300d13eb718dbdfd5b15a7f359d1 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	2	* Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
				25
				26	#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	27	#include "arm_compute/core/CL/CLHelpers.h"
				28	#include "arm_compute/core/CL/CLKernelLibrary.h"
Vidhya Sudhan Loganathan	f1f4906	2018-05-25 13:21:26 +0100	[diff] [blame]	29	#include "arm_compute/core/CL/CLValidate.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	30	#include "arm_compute/core/CL/ICLTensor.h"
				31	#include "arm_compute/core/CL/OpenCL.h"
				32	#include "arm_compute/core/Error.h"
				33	#include "arm_compute/core/Helpers.h"
Isabella Gottardi	d56e770	2018-02-28 14:29:36 +0000	[diff] [blame]	34	#include "arm_compute/core/TensorInfo.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	35	#include "arm_compute/core/Types.h"
				36	#include "arm_compute/core/Utils.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	37	#include "arm_compute/core/Window.h"
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	38	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	39
				40	#include <set>
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	41	#include <string>
				42
				43	using namespace arm_compute;
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	44	using namespace arm_compute::misc::shape_calculator;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	45
// File-local helpers shared by configure() and validate().
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	46	namespace
				47	{
// Alias used for the (x, y) pair of elements processed per work-item; indexed with [0]/[1] below.
				48	using ElementsProcessed = Steps;
				49
// Checks the argument combination accepted by the kernel and returns an error Status on the first violation:
//  - input0 must be single-channel F16 or F32, and input1 must have the same data type;
//  - fp_mixed_precision is accepted only when input0 is F16;
//  - input0 may have at most 4 dimensions, input1 at most 3;
//  - reinterpret_input_as_3d cannot be combined with is_interleaved_transposed, nor with an input1 of more than 2 dimensions;
//  - not reshaped: input0 dimension 0 must equal input1 dimension 1;
//  - reshaped: the shapes of input0/input1 must equal the interleaved / transposed-1xW shapes derived from m, n, k in reshape_info;
//  - if output is already initialised (total_size() != 0), its shape must equal compute_mm_shape(...) and its data type must match input0.
// Returns an empty Status when every check passes.
// NOTE(review): input0/input1 are declared without '*' in this listing, yet they are null-checked and used through '->';
// the pointer declarators appear to have been lost during extraction - confirm against the original file.
Vidhya Sudhan Loganathan	a25d16c	2018-11-16 11:33:12 +0000	[diff] [blame]	50	inline Status validate_arguments(const ITensorInfo input0, const ITensorInfo input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
				51	bool fp_mixed_precision)
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	52	{
Georgios Pinitas	78c0090	2018-01-09 17:33:11 +0000	[diff] [blame]	53	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
Vidhya Sudhan Loganathan	f1f4906	2018-05-25 13:21:26 +0100	[diff] [blame]	54	ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
Vidhya Sudhan Loganathan	7485d5a	2018-07-04 09:34:00 +0100	[diff] [blame]	55	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	56	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
Vidhya Sudhan Loganathan	a25d16c	2018-11-16 11:33:12 +0000	[diff] [blame]	57	ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	58	ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
Gian Marco Iodice	d2fab73	2018-03-02 11:18:12 +0000	[diff] [blame]	59	ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	60	ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
Gian Marco Iodice	d39e2b1	2018-08-06 14:31:15 +0100	[diff] [blame]	61	ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	62
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	63	if(!is_interleaved_transposed)
				64	{
// Plain (not reshaped) inputs: dimension 0 of input0 must equal dimension 1 of input1.
				65	ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	66	}
				67	else
				68	{
// Reshaped inputs: rebuild the un-reshaped shapes from m, n, k, recompute the reshaped shapes
// and require the tensors actually passed in to match them.
				69	const int m = reshape_info.m();
				70	const int n = reshape_info.n();
				71	const int k = reshape_info.k();
				72	const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
				73	const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
				74
				75	TensorShape tensor_shape0{ input0->tensor_shape() };
				76	tensor_shape0.set(0, k);
				77	tensor_shape0.set(1, m);
				78
				79	TensorShape tensor_shape1{ input1->tensor_shape() };
				80	tensor_shape1.set(0, n);
				81	tensor_shape1.set(1, k);
				82
				83	const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
				84	const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
				85
				86	const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
				87	const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
				88
				89	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
				90	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	91	}
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	92
// The output is only checked when it has already been initialised.
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	93	if(output->total_size() != 0)
				94	{
				95	const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(input0, input1, is_interleaved_transposed, reshape_info));
				96	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
				97	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	98	}
				99
				100	return Status{};
				101	}
				102
// Auto-initialises the output info if it is empty, computes the execution window and the padding
// requirements of the three tensors, and writes the number of elements processed per iteration
// (x in [0], y in [1]) into num_elements_processed.
// Returns a pair {status, window}: the window is the execution window collapsed from dimension
// min(output->num_dimensions(), 2) upwards; the status is a RUNTIME_ERROR ("Insufficient Padding!")
// when update_window_and_padding() reported a change, and an empty Status otherwise.
// NOTE(review): as in validate_arguments, the '*' on input0/input1 (and on the first argument of
// auto_init_if_empty / compute_mm_shape) appears to have been lost in this listing - confirm.
				103	inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo input0, ITensorInfo input1, ITensorInfo *output,
Gian Marco Iodice	750641d	2018-05-08 12:01:57 +0100	[diff] [blame]	104	bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	105	ElementsProcessed &num_elements_processed)
				106	{
				107	bool window_changed = false;
// win drives the execute_window_loop; win_out is used only to update the padding of the output tensor.
				108	Window win{};
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	109	Window win_out{};
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	110
				111	const DataType data_type = input0->data_type();
// References into the caller's object, so assignments below are visible to the caller.
				112	unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
				113	unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
Gian Marco Iodice	d39e2b1	2018-08-06 14:31:15 +0100	[diff] [blame]	114	bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
Gian Marco Iodice	3139f03	2018-11-05 14:26:32 +0000	[diff] [blame]	115	bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
Gian Marco Iodice	d39e2b1	2018-08-06 14:31:15 +0100	[diff] [blame]	116
				117	// If both the input and the output have to be reinterpreted as 3D tensors,
				118	// force reinterpret_input_as_3d and reinterpret_output_as_3d to false.
// The equality test is also true when both flags are already false; the assignments are then a no-op.
				119	if(reinterpret_input_as_3d == reinterpret_output_as_3d)
				120	{
				121	reinterpret_input_as_3d = false;
				122	reinterpret_output_as_3d = false;
				123	}
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	124
Gian Marco Iodice	750641d	2018-05-08 12:01:57 +0100	[diff] [blame]	125	// Auto-initialize the output tensor if it has not been initialized yet
Isabella Gottardi	c4f582e	2018-10-11 19:14:55 +0100	[diff] [blame]	126	auto_init_if_empty(output, input0->clone()->set_tensor_shape(compute_mm_shape(input0, *input1, is_interleaved_transposed, reshape_info)));
Gian Marco Iodice	750641d	2018-05-08 12:01:57 +0100	[diff] [blame]	127
// Working copy of the output info; its shape may be collapsed below without touching the real output.
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	128	TensorInfo tmp_info(*output);
				129
Gian Marco Iodice	d39e2b1	2018-08-06 14:31:15 +0100	[diff] [blame]	130	if(reinterpret_output_as_3d)
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	131	{
				132	// Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
				133	// the window needs to be constructed on the 2D collapsed version of the tensor
				134	TensorShape tmp_shape(output->tensor_shape());
				135	tmp_shape.collapse(2U, 1U);
				136	tmp_info.set_tensor_shape(tmp_shape);
				137	}
				138
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	139	if(is_interleaved_transposed)
				140	{
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	141	// reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
Isabella Gottardi	c4f582e	2018-10-11 19:14:55 +0100	[diff] [blame]	142	ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	143
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	144	// Configure kernel window
				145	num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
				146	num_elems_processed_per_iteration_y = 4;
				147
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	148	// Note: the bottom padding is calculated manually because the output can be reinterpreted as a 3D tensor.
				149	// The only way to set the padding properly is to set it explicitly through AccessWindowStatic.
				150	const int m = reshape_info.m();
// Number of rows needed to round m up to the next multiple of the y step (0 when m is already a multiple).
				151	const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
				152
				153	win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
				154	win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	155
				156	AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
Georgios Pinitas	535fedd	2018-05-04 18:52:25 +0100	[diff] [blame]	157	AccessWindowStatic input1_access(input1, 0, 0,
				158	ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
				159	ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	160	AccessWindowStatic output_access(output, 0, 0,
				161	ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
				162	output->dimension(1) + bottom_pad);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	163
// NOTE(review): "\|\|" is presumably an escaped logical OR; if so, short-circuit evaluation skips the
// second update_window_and_padding() call whenever the first one returns true - confirm this is intended.
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	164	window_changed = update_window_and_padding(win, input0_access, input1_access) \|\| // window used by the execute_window_loop
				165	update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	166
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	167	output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	168	}
				169	else // The input tensors have not been reshaped
				170	{
				171	// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
				172	num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
				173	num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
				174
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	175	// Note: the bottom padding is calculated manually because the output can be reinterpreted as a 3D tensor.
				176	// The only way to set the padding properly is to set it explicitly through AccessWindowStatic.
Gian Marco Iodice	d39e2b1	2018-08-06 14:31:15 +0100	[diff] [blame]	177	const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1];
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	178	const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
				179
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	180	// Create kernels according to the architecture, data type and input size.
Michalis Spyrou	a967611	2018-02-22 18:07:43 +0000	[diff] [blame]	181	GPUTarget arch_target = get_arch_from_target(gpu_target);
				182	if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	183	{
// Bifrost F32 override: x step is 2 for a 1D input0 with input1 dimension 0 <= 1000, otherwise 4.
Gian Marco	1d25ed5	2017-12-16 19:33:50 +0000	[diff] [blame]	184	num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	185	}
				186
				187	// Configure window
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	188	win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
				189	win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	190
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	191	AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad);
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	192	AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
				193	AccessWindowStatic output_access(output, 0, 0,
				194	ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
				195	output->dimension(1) + bottom_pad);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	196
// NOTE(review): same presumed short-circuit OR as in the reshaped branch above.
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	197	window_changed = update_window_and_padding(win, input0_access, input1_access) \|\| // window used by the execute_window_loop
				198	update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	199
				200	Coordinates coord;
				201	coord.set_num_dimensions(output->num_dimensions());
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	202	output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	203	}
				204
Gian Marco	ae2af74	2018-02-15 12:35:44 +0000	[diff] [blame]	205	// Collapse along the Z direction
				206	// This collapse needs to be done here so that the Z dimension of the LWS can be tuned
Gian Marco Iodice	81b28c4	2018-03-29 10:29:36 +0100	[diff] [blame]	207	Window collapsed = win;
				208	const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
				209	collapsed = win.collapse(win, dimension_to_collapse);
Gian Marco	ae2af74	2018-02-15 12:35:44 +0000	[diff] [blame]	210
// A changed window is reported to the caller as an "Insufficient Padding!" runtime error.
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	211	Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Gian Marco	ae2af74	2018-02-15 12:35:44 +0000	[diff] [blame]	212	return std::make_pair(err, collapsed);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	213	}
				214	} // namespace
				215
// Default constructor: leaves the kernel unconfigured. All tensor pointers are null, matrix B sliding
// is enabled and both 3D-reinterpretation flags are off until configure() is called.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	216	CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	217	: _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	218	{
				219	}
				220
// Configures the kernel: validates the arguments, stores the tensors, computes the execution window,
// assembles the OpenCL build options, selects and creates the kernel, and builds the LWS-tuning config id.
//  input0, input1             - input matrices (stored in _input0 / _input1).
//  output                     - destination tensor; its info is auto-initialised by validate_and_configure_window() if empty.
//  alpha                      - scalar weight; "-DALPHA" is only defined when it differs from 1.0f by more than 0.00001f.
//  is_interleaved_transposed  - true if the inputs have been reshaped; selects the "gemm_mm_interleaved_transposed_*" kernels.
//  reshape_info               - supplies the transpose/interleave multipliers and the 3D-reinterpretation settings.
//  fp_mixed_precision         - appends "_acc32" to some F16 kernel names (see the selection logic below).
// Validation or window-configuration failures are raised through ARM_COMPUTE_ERROR_THROW_ON.
// NOTE(review): input0/input1 are declared without '*' in this listing but are used through '->';
// the pointer declarators appear to have been lost during extraction - confirm against the original file.
Vidhya Sudhan Loganathan	a25d16c	2018-11-16 11:33:12 +0000	[diff] [blame]	221	void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor input0, const ICLTensor input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
				222	bool fp_mixed_precision)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	223	{
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	224	ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
				225
				226	// Perform validate step
Vidhya Sudhan Loganathan	a25d16c	2018-11-16 11:33:12 +0000	[diff] [blame]	227	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, fp_mixed_precision));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	228
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	229	_input0 = input0;
				230	_input1 = input1;
				231	_output = output;
				232	_reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
Gian Marco Iodice	3139f03	2018-11-05 14:26:32 +0000	[diff] [blame]	233	_reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	234
Gian Marco Iodice	d39e2b1	2018-08-06 14:31:15 +0100	[diff] [blame]	235	// If both the input and the output have to be reinterpreted as 3D tensors,
				236	// force reinterpret_input_as_3d and reinterpret_output_as_3d to false.
// The equality test is also true when both flags are already false; the assignments are then a no-op.
				237	if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
				238	{
				239	_reinterpret_input_as_3d = false;
				240	_reinterpret_output_as_3d = false;
				241	}
				242
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	243	// Check if we need to slide the matrix B
// When input0 is reinterpreted as 3D, one of its dimensions is not counted for this comparison.
				244	const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();
				245
				246	_slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	247
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	248	const DataType data_type = input0->info()->data_type();
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	249
				250	// Get target architecture
				251	GPUTarget gpu_target = get_target();
				252
// Filled in by validate_and_configure_window(); read below for the NUM_ELEMS_PROCESSED_PER_THREAD_* options.
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	253	ElementsProcessed num_elements_processed{};
				254
				255	// Configure kernel window
Gian Marco Iodice	750641d	2018-05-08 12:01:57 +0100	[diff] [blame]	256	auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	257	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier	b6eb353	2018-08-08 13:20:04 +0100	[diff] [blame]	258	ICLKernel::configure_internal(win_config.second);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	259
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	260	// Create build options
				261	CLBuildOptions build_opts;
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	262
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	263	// Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	264	if(std::abs(1.0f - alpha) > 0.00001f)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	265	{
Vidhya Sudhan Loganathan	7485d5a	2018-07-04 09:34:00 +0100	[diff] [blame]	266	build_opts.add_option("-DALPHA=" + float_to_string_with_full_precision(alpha));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	267	}
// 3D-reinterpretation options; HEIGHT_GEMM3D / DEPTH_GEMM3D are taken from output dimensions 1 and 2.
// NOTE(review): "\|\|" below is presumably an escaped logical OR introduced by the page extraction.
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	268	build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
				269	build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
				270	build_opts.add_option_if(_reinterpret_input_as_3d \|\| _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
				271	build_opts.add_option_if(_reinterpret_input_as_3d \|\| _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	272
Gian Marco Iodice	d2fab73	2018-03-02 11:18:12 +0000	[diff] [blame]	273	// Do not slide matrix B if _slide_matrix_b = false
				274	build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
				275
Gian Marco Iodice	bb36a8e	2018-04-19 12:05:08 +0100	[diff] [blame]	276	const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
				277
// Kernel-name selection:
//  reshaped,     float on Bifrost : "gemm_mm_interleaved_transposed_<type>_bifrost"
//  reshaped,     otherwise        : "gemm_mm_interleaved_transposed_<type>" (+ "_acc32" for mixed-precision F16)
//  not reshaped, float on Bifrost : "gemm_mm_floating_point" plus an optional suffix chosen from the input shape
//  not reshaped, otherwise        : "gemm_mm_floating_point"
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	278	std::string kernel_name;
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	279	if(is_interleaved_transposed)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	280	{
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	281	const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
				282	const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
				283
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	284	build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	285	build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
				286	build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
				287
Gian Marco Iodice	bb36a8e	2018-04-19 12:05:08 +0100	[diff] [blame]	288	if(is_data_type_float(data_type) && is_bifrost)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	289	{
Gian Marco Iodice	bb36a8e	2018-04-19 12:05:08 +0100	[diff] [blame]	290	kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	291	}
				292	else
				293	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	294	kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
Vidhya Sudhan Loganathan	38d93bd	2018-11-20 15:38:13 +0000	[diff] [blame]	295	if(fp_mixed_precision && data_type == DataType::F16)
				296	{
				297	// Currently the wider accumulator is only supported for fp16 kernels.
				298	kernel_name += "_acc32";
				299	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	300	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	301	}
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	302	else // The input tensors have not been reshaped
				303	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	304	build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
Gian Marco Iodice	e52a300	2018-04-11 15:59:10 +0100	[diff] [blame]	305	build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	306
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	307	// Create kernels according to the architecture, data type and input size.
Gian Marco Iodice	bb36a8e	2018-04-19 12:05:08 +0100	[diff] [blame]	308	if(is_data_type_float(data_type) && is_bifrost)
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	309	{
Gian Marco Iodice	e52a300	2018-04-11 15:59:10 +0100	[diff] [blame]	310	kernel_name = "gemm_mm_floating_point";
				311
// Multi-dimensional input0: use the "<type>_bifrost" variant (with "_acc32" for mixed-precision F16).
				312	if(input0->info()->num_dimensions() != 1)
Gian Marco Iodice	fd68311	2018-04-17 09:52:44 +0100	[diff] [blame]	313	{
Gian Marco Iodice	e52a300	2018-04-11 15:59:10 +0100	[diff] [blame]	314	kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
Vidhya Sudhan Loganathan	38d93bd	2018-11-20 15:38:13 +0000	[diff] [blame]	315	if(fp_mixed_precision && data_type == DataType::F16)
				316	{
				317	// Currently the wider accumulator is only supported for fp16 kernels.
				318	kernel_name += "_acc32";
				319	}
Gian Marco Iodice	e52a300	2018-04-11 15:59:10 +0100	[diff] [blame]	320	}
				321	else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
				322	{
				323	// The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
				324	// FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
				325	// FC6 and FC7 of AlexNet and VGG-16).
				326	kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
Gian Marco Iodice	fd68311	2018-04-17 09:52:44 +0100	[diff] [blame]	327	}
// Any other 1D input0 case keeps the plain "gemm_mm_floating_point" name.
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	328
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	329	// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
				330	// via exhaustive autotuning over a range of representative layer configurations.
Anthony Barbier	b6eb353	2018-08-08 13:20:04 +0100	[diff] [blame]	331	set_lws_hint(cl::NDRange(4));
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	332	}
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	333	else // (MIDGARD and F32) or (F16)
				334	{
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	335	kernel_name = "gemm_mm_floating_point";
				336	}
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	337	build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
				338	build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	339	}
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	340
				341	// Create kernel
				342	_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
				343
				344	// Set config_id for enabling LWS tuning
// Layout: "gemm_" [reshaped_] [fp_mixed_] [3di_] [3do_] <type> _ <out dim1> _ <out dim0> _ <out dim2> _ <out dim3> _ <input1 dim0 if reshaped, else input1 dim1>
				345	_config_id = "gemm_";
				346	_config_id += (is_interleaved_transposed ? "reshaped_" : "");
Vidhya Sudhan Loganathan	a25d16c	2018-11-16 11:33:12 +0000	[diff] [blame]	347	_config_id += (fp_mixed_precision ? "fp_mixed_" : "");
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	348	_config_id += (_reinterpret_input_as_3d ? "3di_" : "");
				349	_config_id += (_reinterpret_output_as_3d ? "3do_" : "");
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	350	_config_id += lower_string(string_from_data_type(input0->info()->data_type()));
				351	_config_id += "_";
				352	_config_id += support::cpp11::to_string(output->info()->dimension(1));
				353	_config_id += "_";
				354	_config_id += support::cpp11::to_string(output->info()->dimension(0));
				355	_config_id += "_";
Gian Marco	ae2af74	2018-02-15 12:35:44 +0000	[diff] [blame]	356	_config_id += support::cpp11::to_string(output->info()->dimension(2));
				357	_config_id += "_";
				358	_config_id += support::cpp11::to_string(output->info()->dimension(3));
				359	_config_id += "_";
Anton Lokhmotov	3e80c7f	2017-11-20 11:02:10 +0000	[diff] [blame]	360	_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	361	}
				362
// Static check that mirrors configure() without creating a kernel: runs validate_arguments() and then
// validate_and_configure_window() on clones of the tensor infos, so the caller's infos are not modified.
// alpha is not inspected. Returns the first error Status encountered, or an empty Status on success.
// NOTE(review): input0/input1 are declared without '*' in this listing but are used through '->';
// the pointer declarators appear to have been lost during extraction - confirm against the original file.
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	363	Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo input0, const ITensorInfo input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
Vidhya Sudhan Loganathan	a25d16c	2018-11-16 11:33:12 +0000	[diff] [blame]	364	const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	365	{
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	366	// Note: num_elements_processed will be set in validate_and_configure_window()
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	367	ElementsProcessed num_elements_processed{};
				368	ARM_COMPUTE_UNUSED(alpha);
Vidhya Sudhan Loganathan	a25d16c	2018-11-16 11:33:12 +0000	[diff] [blame]	369	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info, fp_mixed_precision));
// Only the Status (.first) of the returned pair is used; the computed window is discarded.
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	370	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
				371	input1->clone().get(),
				372	output->clone().get(),
				373	is_interleaved_transposed,
Gian Marco Iodice	750641d	2018-05-08 12:01:57 +0100	[diff] [blame]	374	reshape_info,
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	375	gpu_target,
				376	num_elements_processed)
				377	.first);
				378
				379	return Status{};
				380	}
				381
// Enqueues the configured kernel on queue once per 3D slice of window.
// For every slice the three tensors are bound as 2D tensor arguments, followed by dimension-2 strides
// (in bytes) of input0, input1 and output. When a tensor is reinterpreted as 3D, its total cross-plane
// padding is passed once, before the loop, as an extra trailing argument.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	382	void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
				383	{
				384	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				385	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
				386
Gian Marco	ae2af74	2018-02-15 12:35:44 +0000	[diff] [blame]	387	if(_input1->info()->num_dimensions() < 3)
				388	{
				389	// The stride_z for matrix B must be zero if we do not slice
// NOTE(review): the comment mentions stride_z but the check reads strides_in_bytes()[3] - confirm the intended index.
				390	ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
				391	}
				392
				393	Window slice = window.first_slice_window_3D();
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	394	Window slice_matrix_b = slice;
Gian Marco Iodice	edfa9f4	2017-08-15 11:45:22 +0100	[diff] [blame]	395
// Fixed window for matrix B, used in place of the sliding slice when B must not be slid (see the loop below).
				396	slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
				397	slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	398
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	399	if(_reinterpret_input_as_3d)
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	400	{
Isabella Gottardi	b92805b	2018-09-28 18:24:27 +0100	[diff] [blame]	401	// Pass the total cross-plane padding (top + bottom) to the kernel if the input has to be reinterpreted as a 3D tensor
// The argument index skips the three 2D tensor arguments and the three stride arguments set inside the loop.
Georgios Pinitas	e8bd2c7	2018-07-11 15:54:56 +0100	[diff] [blame]	402	const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
Gian Marco Iodice	68a3f56	2018-07-26 11:44:03 +0100	[diff] [blame]	403	const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
				404	_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
				405	}
				406
				407	if(_reinterpret_output_as_3d)
				408	{
				409	// Pass the total cross-plane padding (top + bottom) to the kernel if the output has to be reinterpreted as a 3D tensor
// Placed one slot later when the input padding argument above is also present.
				410	const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
Georgios Pinitas	e8bd2c7	2018-07-11 15:54:56 +0100	[diff] [blame]	411	const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
				412	_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
Isabella Gottardi	8e74f44	2018-03-01 16:42:00 +0000	[diff] [blame]	413	}
				414
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	415	do
				416	{
				417	Window slice_b = slice;
				418	// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
Gian Marco	ae2af74	2018-02-15 12:35:44 +0000	[diff] [blame]	419	// This scenario can happen when the matrix multiplication is used to perform a convolution operation
Gian Marco Iodice	d2fab73	2018-03-02 11:18:12 +0000	[diff] [blame]	420	if(!_slide_matrix_b)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	421	{
				422	slice_b = slice_matrix_b;
				423	}
				424
// Argument order: input0, input1, output (2D tensor arguments), then the dimension-2 stride in bytes of each.
				425	unsigned int idx = 0;
				426	add_2D_tensor_argument(idx, _input0, slice);
				427	add_2D_tensor_argument(idx, _input1, slice_b);
				428	add_2D_tensor_argument(idx, _output, slice);
Gian Marco Iodice	d2fab73	2018-03-02 11:18:12 +0000	[diff] [blame]	429	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
				430	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
				431	_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
Anthony Barbier	b6eb353	2018-08-08 13:20:04 +0100	[diff] [blame]	432	enqueue(queue, *this, slice, lws_hint());
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	433	}
Gian Marco	ae2af74	2018-02-15 12:35:44 +0000	[diff] [blame]	434	while(window.slide_window_slice_3D(slice));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	435	}