Blame - src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp - ml/ComputeLibrary

blob: 63aed6df32d841dafc4272a468c260baa1ef068f [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	2	* Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
				25
				26	#include "arm_compute/core/AccessWindowTranspose.h"
				27	#include "arm_compute/core/CL/CLHelpers.h"
				28	#include "arm_compute/core/CL/CLKernelLibrary.h"
				29	#include "arm_compute/core/CL/ICLTensor.h"
				30	#include "arm_compute/core/CL/OpenCL.h"
				31	#include "arm_compute/core/Error.h"
				32	#include "arm_compute/core/Helpers.h"
				33	#include "arm_compute/core/Types.h"
				34	#include "arm_compute/core/Validate.h"
				35	#include "arm_compute/core/Window.h"
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	36	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	37
				38	#include <cmath>
				39
				40	using namespace arm_compute;
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	41	using namespace arm_compute::misc::shape_calculator;
				42
				43	namespace
				44	{
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	45	Status validate_arguments(const ITensorInfo input, const ITensorInfo output, int mult_transpose1xW_width)
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	46	{
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	47	ARM_COMPUTE_RETURN_ERROR_ON(mult_transpose1xW_width < 1);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	48	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
				49	DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
				50	DataType::F16, DataType::F32);
				51
				52	if(output->total_size() != 0)
				53	{
				54	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	55	compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width));
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	56	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
				57	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
				58	}
				59
				60	return Status{};
				61	}
				62
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	63	std::pair<Status, Window> validate_and_configure_window(ITensorInfo input, ITensorInfo output, unsigned int &num_elems_processed_per_iteration, int mult_transpose1xW_width)
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	64	{
				65	num_elems_processed_per_iteration = 16 / input->element_size();
				66
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	67	const int scale_x = num_elems_processed_per_iteration * mult_transpose1xW_width;
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	68	bool window_changed = false;
				69
				70	// Configure kernel window
				71	Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
				72
				73	if((win.x().end() / scale_x) == 0)
				74	{
				75	return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Transposed shape would be 0 in the second dimension"), win);
				76	}
				77
				78	AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
				79	window_changed = window_changed \|\| update_window_and_padding(win, input_access);
				80
				81	// Configure window in case of configured output
				82	if(output->total_size() != 0)
				83	{
				84	AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
				85	window_changed = window_changed \|\| update_window_and_padding(win, output_access);
				86	output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
				87	}
				88
				89	Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
				90	return std::make_pair(err, win);
				91	}
				92	} // namespace
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	93
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	94	void CLGEMMTranspose1xWKernel::configure(const ICLTensor input, ICLTensor output, int mult_transpose1xW_width)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	95	{
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	96	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	97
				98	// Output tensor auto inizialitation if not yet initialized
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	99	auto_init_if_empty(output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(input->info(), mult_transpose1xW_width)));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	100
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	101	// Perform validate step
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	102	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_transpose1xW_width));
Georgios Pinitas	4cbee6e	2017-06-19 13:02:56 +0100	[diff] [blame]	103
				104	_input = input;
				105	_output = output;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	106
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	107	// Configure kernel window
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	108	// Note: num_elems_processed_per_iteration will be set in validate_and_configure_window()
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	109	unsigned int num_elems_processed_per_iteration = 1;
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	110	auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, mult_transpose1xW_width);
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	111	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
				112	ICLKernel::configure(win_config.second);
				113
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	114	// Create build options
				115	CLBuildOptions build_opts;
				116	build_opts.add_option("-DTRANSPOSE_W=" + support::cpp11::to_string(num_elems_processed_per_iteration));
				117	build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
				118
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	119	/*
				120	* Following an example of how the transposition1xW works when the input data type is F32
				121	*
				122	* \|a00 a01 a02 a03\|
				123	* \|a10 a11 a12 a13\|
				124	* \|a20 a21 a22 a23\| = \| a00 a01 a02 a03 \|\| a10 a11 a12 a13 \|\| a20 a21 a22 a23 \|\| a30 a31 a32 a33 \|
				125	* \|a30 a31 a32 a33\|
				126	*
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	127	* The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	128	*/
				129	// Create kernel
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	130	std::string kernel_name = "gemm_transpose1xW";
				131	_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	132	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	133
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	134	Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo input, const ITensorInfo output, int mult_transpose1xW_width)
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	135	{
				136	unsigned int num_elems_processed_per_iteration = 1;
Gian Marco	36a0a46	2018-01-12 10:21:40 +0000	[diff] [blame]	137	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_transpose1xW_width));
				138	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, mult_transpose1xW_width).first);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	139
Georgios Pinitas	358ca20	2017-12-07 16:47:52 +0000	[diff] [blame]	140	return Status{};
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	141	}
				142
				143	void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
				144	{
				145	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				146	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
				147
				148	// Output is transposed
				149	Window out_window(window);
				150	out_window.set(Window::DimX, window.y());
				151	out_window.set(Window::DimY, window.x());
				152
				153	Window in_slice = window.first_slice_window_2D();
				154	Window out_slice = out_window.first_slice_window_2D();
				155
				156	do
				157	{
				158	unsigned int idx = 0;
				159	add_2D_tensor_argument(idx, _input, in_slice);
				160	add_2D_tensor_argument(idx, _output, out_slice);
				161	enqueue(queue, *this, in_slice, _lws_hint);
				162	}
				163	while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
				164	}