blob: 37eeeb78bf9c3ae072d5257554166b0df468be99 [file] [log] [blame]
giuros01164a2722018-11-20 18:34:46 +00001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
25
26#include "arm_compute/core/CL/CLHelpers.h"
27#include "arm_compute/core/CL/CLValidate.h"
28#include "arm_compute/core/CL/ICLTensor.h"
29#include <map>
30
31namespace arm_compute
32{
33namespace
34{
// Number of output elements each work-item processes along X (the kernels' vector width).
constexpr unsigned int num_elems_processed_per_iteration = 16;

// Maps every supported element-wise operation to the OP token passed to the OpenCL kernels.
std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
{
    { ArithmeticOperation::ADD, "ADD" },
    { ArithmeticOperation::SUB, "SUB" },
    { ArithmeticOperation::DIV, "DIV" },
    { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
    { ArithmeticOperation::MIN, "MIN" },
    { ArithmeticOperation::MAX, "MAX" },
};

// Subset of operations that have a saturating variant (used by CLSaturatedArithmeticOperationKernel).
std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
{
    { ArithmeticOperation::ADD, "ADD" },
    { ArithmeticOperation::SUB, "SUB" },
};
52
53std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
54{
55 std::string config_id;
56 // Set config_id for enabling LWS tuning
57 config_id = kernel_name;
58 config_id += "_";
59 config_id += lower_string(string_from_data_type(input1.data_type()));
60 config_id += "_";
61 config_id += support::cpp11::to_string(output.dimension(0));
62 config_id += "_";
63 config_id += support::cpp11::to_string(output.dimension(1));
64 return config_id;
65}
66
/** Validate the tensor combination for element-wise division.
 *
 * Division is float-only: both inputs must be F16 or F32 and share the same data type;
 * shapes must be broadcast compatible. If the output is already configured it must match
 * the inputs' data type and the broadcast shape.
 */
Status validate_arguments_with_division_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&input1, &input2, &output);
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32);
    // input2's type is constrained transitively: it must match input1's.
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);

    // A broadcast shape with total size 0 means the two shapes cannot be broadcast together.
    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    // Validate in case of configured output
    if(output.total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
    }

    return Status{};
}
89
/** Validate the tensor combination for the generic element-wise arithmetic operations.
 *
 * Inputs may be U8, QASYMM8, S16, F16 or F32; quantized inputs must share the same data
 * type. Shapes must be broadcast compatible. A configured output must also be one of the
 * supported types, may only be U8 when both inputs are U8, and (when quantized) must match
 * input1's data type.
 */
Status validate_arguments_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);

    // Mixed quantized/non-quantized input combinations are rejected.
    const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
    if(is_qasymm)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
    }

    // A broadcast shape with total size 0 means the two shapes cannot be broadcast together.
    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    // Validate in case of configured output
    if(output.total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
                                        "Output can only be U8 if both inputs are U8");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
        if(is_qasymm)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
        }
    }
    return Status{};
}
123
124CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, const std::string &operation_string)
125{
126 CLBuildOptions build_opts;
127
128 build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1.data_type()));
129 build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2.data_type()));
130 build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output.data_type()));
131 build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
132 build_opts.add_option("-DOP=" + operation_string);
133 if(is_data_type_quantized_asymmetric(input1.data_type()))
134 {
135 build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1.quantization_info().offset));
136 build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2.quantization_info().offset));
137 build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output.quantization_info().offset));
138 build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1.quantization_info().scale));
139 build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2.quantization_info().scale));
140 build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output.quantization_info().scale));
141 }
142 return build_opts;
143}
144
/** Compute the maximum execution window over the broadcast valid region and register the
 * horizontal access patterns of both inputs and the output.
 *
 * Returns the computed window paired with RUNTIME_ERROR ("Insufficient Padding!") when
 * update_window_and_padding() had to alter a window, Status{} otherwise.
 */
std::pair<Status, Window> configure_window_arithmetic_common(const ValidRegion &valid_region, ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
    Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
    // Per-input windows: dimensions of size 1 are handled by broadcast_if_dimension_le_one().
    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
    Window win_input2 = win.broadcast_if_dimension_le_one(input2);

    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);

    // NOTE(review): '||' short-circuits, so once one update reports a change the remaining
    // update_window_and_padding() calls are skipped — confirm this is intended.
    bool window_changed = update_window_and_padding(win_input1, input1_access)
                          || update_window_and_padding(win_input2, input2_access)
                          || update_window_and_padding(win, output_access);

    output_access.set_valid_region(win, valid_region);

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
164
/** Auto-initialise the output (broadcast shape, format promoted from the input types) and
 * configure the common arithmetic execution window.
 */
std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    set_shape_if_empty(output, out_shape);

    // Output format promotion (only applied when the output format is still unknown):
    // S16 if either input is S16; F16 only when BOTH inputs are F16; F32 if either is F32.
    if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
    {
        set_format_if_unknown(output, Format::S16);
    }
    else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
    {
        set_format_if_unknown(output, Format::F16);
    }
    else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
    {
        set_format_if_unknown(output, Format::F32);
    }

    return configure_window_arithmetic_common(valid_region, input1, input2, output);
}
giuros01164a2722018-11-20 18:34:46 +0000188
giuros0149f7c022018-12-03 19:25:22 +0000189std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
190{
191 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
192 const TensorShape &out_shape = broadcast_pair.first;
193 const ValidRegion &valid_region = broadcast_pair.second;
194 auto_init_if_empty(output, out_shape, 1, input1.data_type());
195 return configure_window_arithmetic_common(valid_region, input1, input2, output);
giuros01164a2722018-11-20 18:34:46 +0000196}
197} // namespace
198
/** Default constructor: tensors start unset; configure_common() wires them up. */
CLElementwiseOperationKernel::CLElementwiseOperationKernel()
    : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
203
/** Shared configuration path: validates the tensors, computes the execution window, builds
 * the OpenCL kernel and the tuning config id.
 *
 * Relies on the virtual hooks validate_arguments(), validate_and_configure_window(),
 * name(), generate_build_options() and generate_id_for_tuning() supplied by the concrete
 * subclass, so subclasses must set their operation state before calling this.
 */
void CLElementwiseOperationKernel::configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));

    // Configure kernel window
    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

    _input1 = input1;
    _input2 = input2;
    _output = output;

    // Quantized inputs are served by a dedicated "_quantized" kernel variant.
    std::string kernel_name = "elementwise_operation_" + name();
    if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
    {
        kernel_name += "_quantized";
    }

    // Set kernel build options
    CLBuildOptions build_opts = generate_build_options(*input1->info(), *input2->info(), *output->info());

    // Create kernel
    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    ICLKernel::configure_internal(win_config.second);

    _config_id = generate_id_for_tuning(kernel_name, *input1->info(), *output->info());
}
233
/** Enqueue the kernel over 3D slices of the (possibly collapsed) window.
 *
 * Dimensions from Z upwards are collapsed into a single slice loop only when both inputs
 * agree on them, i.e. no broadcasting happens above the XY plane.
 */
void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
    const TensorShape &out_shape = _output->info()->tensor_shape();

    // Collapsing is allowed only when every dimension >= Z matches between the inputs
    // (and neither input is a plain vector or a scalar-like tensor).
    bool       can_collapse = true;
    const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
    {
        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
        {
            can_collapse = (in_shape1[d] == in_shape2[d]);
        }
    }

    bool   has_collapsed = false;
    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;

    // Keep the input shapes consistent with the (possibly) collapsed iteration space.
    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;

    Window slice = collapsed.first_slice_window_3D();
    // Per-input slices: size-1 dimensions are adjusted by broadcast_if_dimension_le_one().
    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);

    do
    {
        unsigned int idx = 0;

        add_3D_tensor_argument(idx, _input1, slice_input1);
        add_3D_tensor_argument(idx, _input2, slice_input2);
        add_3D_tensor_argument(idx, _output, slice);

        enqueue(queue, *this, slice, lws_hint());

        // Advance the input slices in lock-step; the output slice drives the loop condition.
        collapsed.slide_window_slice_3D(slice_input1);
        collapsed.slide_window_slice_3D(slice_input2);
    }
    while(collapsed.slide_window_slice_3D(slice));
}
279
280BorderSize CLElementwiseOperationKernel::border_size() const
281{
282 const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
283 const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
284 return BorderSize(0, border, 0, 0);
285}
286
/** Arithmetic operations with saturation */
288
289void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy)
290{
291 _policy = policy;
292 _op = op;
293 configure_common(input1, input2, output);
294}
295
/** Static validation for the saturated kernel: tensor rules plus a dry run of the window
 * configuration. Runs on clones so the caller's tensor infos are never mutated.
 */
Status CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy)
{
    // Neither the operation nor the policy affects the validity of the tensor combination.
    ARM_COMPUTE_UNUSED(op, policy);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);

    return Status{};
}
305
/** Window hook: saturated kernels share the generic arithmetic window configuration. */
std::pair<Status, Window> CLSaturatedArithmeticOperationKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
    return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
}
310
/** Tensor-rule hook: saturated kernels use the generic arithmetic rules. */
Status CLSaturatedArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
{
    return validate_arguments_with_arithmetic_rules(input1, input2, output);
}
315
316CLBuildOptions CLSaturatedArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
317{
318 const bool has_float_out = is_data_type_float(output.data_type());
319 auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
320 build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
321 return build_options;
322}
323std::string CLSaturatedArithmeticOperationKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
324{
325 auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
326 config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
327 config_id += lower_string(string_from_data_layout(input1.data_layout()));
328 return config_id;
329}
330
331std::string CLSaturatedArithmeticOperationKernel::name()
332{
333 return supported_sat_arithmetic_ops[_op];
334}
335
/** Arithmetic operations */
337
/** Store the requested operation and run the shared configuration.
 *
 * _op must be set before configure_common(), which calls the virtual name() /
 * generate_build_options() / validate_and_configure_window() hooks that read it.
 */
void CLArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
    _op = op;
    configure_common(input1, input2, output);
}
343
/** Static validation: division follows float-only rules; every other operation follows
 * the generic arithmetic rules. Window configuration is dry-run on clones so the caller's
 * tensor infos are never mutated.
 */
Status CLArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
    if(op == ArithmeticOperation::DIV)
    {
        // Division doesn't support integer arithmetic
        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_division_rules(*input1, *input2, *output));
        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*input1->clone(), *input2->clone(), *output->clone()).first);
    }
    else
    {
        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
    }

    return Status{};
}
361std::pair<Status, Window> CLArithmeticOperationKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
362{
giuros0149f7c022018-12-03 19:25:22 +0000363 if(_op == ArithmeticOperation::DIV)
364 {
365 // Division doesn't support integer arithmetic
366 return validate_and_configure_window_for_division(input1, input2, output);
367 }
368 else
369 {
370 return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
371 }
giuros01164a2722018-11-20 18:34:46 +0000372}
373Status CLArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
374{
giuros0149f7c022018-12-03 19:25:22 +0000375 if(_op == ArithmeticOperation::DIV)
376 {
377 // Division doesn't support integer arithmetic
378 return validate_arguments_with_division_rules(input1, input2, output);
379 }
380 else
381 {
382 return validate_arguments_with_arithmetic_rules(input1, input2, output);
383 }
giuros01164a2722018-11-20 18:34:46 +0000384}
385
/** Build-option hook: plain arithmetic kernels only need the common arithmetic options. */
CLBuildOptions CLArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
{
    return generate_build_options_with_arithmetic_rules(input1, input2, output, name());
}
/** Tuning-id hook: plain arithmetic kernels use the common id unchanged. */
std::string CLArithmeticOperationKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
{
    return generate_id_for_tuning_common(kernel_name, input1, output);
}
394
395std::string CLArithmeticOperationKernel::name()
396{
397 return supported_arithmetic_ops[_op];
398}
399} // namespace arm_compute