Blame - src/core/CL/kernels/CLTileKernel.cpp - ml/ComputeLibrary

blob: 7559e7ae72798af6c9e3afde8d131808a3838bfb [file] [log] [blame]

giuros01	3175fcf	2018-11-21 09:59:17 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2018 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/CL/kernels/CLTileKernel.h"
				25
				26	#include "arm_compute/core/CL/CLHelpers.h"
				27	#include "arm_compute/core/CL/CLKernelLibrary.h"
				28	#include "arm_compute/core/CL/CLValidate.h"
				29	#include "arm_compute/core/CL/ICLTensor.h"
				30	#include "arm_compute/core/Helpers.h"
				31	#include "arm_compute/core/IAccessWindow.h"
				32	#include "arm_compute/core/TensorInfo.h"
				33	#include "arm_compute/core/Utils.h"
				34	#include "arm_compute/core/Validate.h"
				35	#include "arm_compute/core/Window.h"
				36	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
				37
				38	namespace arm_compute
				39	{
				40	namespace
				41	{
				42	Status validate_arguments(const ITensorInfo input, const ITensorInfo output, const Multiples &multiples)
				43	{
				44	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
				45	ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
				46	ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
				47	ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
				48	{
				49	return e == 0;
				50	}));
				51
				52	// Validate output if initialized
				53	if(output->total_size() != 0)
				54	{
				55	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
				56	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
				57	}
				58
				59	return Status{};
				60	}
				61	} // namespace
				62
				63	CLTileKernel::CLTileKernel()
				64	: _input(nullptr), _output(nullptr)
				65	{
				66	}
				67
				68	void CLTileKernel::configure(const ICLTensor input, ICLTensor output, const Multiples &multiples)
				69	{
				70	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
				71
				72	// Auto initialize output
				73	TensorShape tiled_shape = misc::shape_calculator::compute_tiled_shape(input->info()->tensor_shape(), multiples);
				74	auto_init_if_empty(*output->info(), tiled_shape, 1, input->info()->data_type());
				75
				76	// Validate
				77	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), multiples));
				78
				79	_input = input;
				80	_output = output;
				81
				82	const DataType data_type = input->info()->data_type();
				83	const int vec_size_x = 16 / input->info()->element_size();
				84	const int input_width_x = input->info()->tensor_shape().x();
				85	const unsigned int offset = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
				86	const bool multi_access_x = (input_width_x / vec_size_x > 0);
				87
				88	// Create kernel
				89	CLBuildOptions build_opts;
				90	build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
				91	build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width_x));
				92	build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
				93	build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
				94	build_opts.add_option("-DSRC_BATCHES=" + support::cpp11::to_string(input->info()->dimension(3)));
				95	build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
				96	build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
				97	build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
				98	_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("tile", build_opts.options()));
				99
				100	// Configure window without padding
				101	Window win = calculate_max_window(*output->info());
				102
				103	if(multi_access_x)
				104	{
				105	// If multi-access is enabled, no thread should cross the tile boundaries. This means we need
				106	// as many threads as those to cover a single tile times multiples[0]. Note that if threads
				107	// do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and
				108	// we don't need to pad the output
				109	const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
				110	win.set(Window::DimX,
				111	Window::Dimension(win.x().start(), size_win_x, vec_size_x));
				112	}
				113
				114	ICLKernel::configure_internal(win);
				115
				116	// Set config_id for enabling LWS tuning
				117	_config_id = "tile";
				118	_config_id += "_";
				119	_config_id += lower_string(string_from_data_type(input->info()->data_type()));
				120	for(unsigned int i = 0; i < multiples.size(); ++i)
				121	{
				122	_config_id += "_";
				123	_config_id += support::cpp11::to_string(input->info()->dimension(i));
				124	_config_id += "_";
				125	_config_id += support::cpp11::to_string(multiples[i]);
				126	}
				127	}
				128
				129	Status CLTileKernel::validate(const ITensorInfo input, const ITensorInfo output, const Multiples &multiples)
				130	{
				131	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, multiples));
				132	return Status{};
				133	}
				134
				135	void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
				136	{
				137	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				138	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
				139
				140	Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
				141	Window slice = collapsed.first_slice_window_4D();
				142
				143	do
				144	{
				145	unsigned int idx = 0;
				146	add_4D_tensor_argument(idx, _input, slice);
				147	add_4D_tensor_argument(idx, _output, slice);
				148	enqueue(queue, *this, slice);
				149	}
				150	while(collapsed.slide_window_slice_4D(slice));
				151	}
				152	} // namespace arm_compute