/*
 * Copyright (c) 2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
| 24 | #include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h" |
| 25 | |
| 26 | #include "arm_compute/core/CL/CLHelpers.h" |
| 27 | #include "arm_compute/core/CL/CLValidate.h" |
| 28 | #include "arm_compute/core/CL/ICLTensor.h" |
| 29 | |
| 30 | using namespace arm_compute; |
| 31 | |
namespace
{
// Number of elements each OpenCL work-item processes along the X dimension.
constexpr unsigned int num_elems_processed_per_iteration = 16;

/** Static validation of the tensor metadata for the division kernel.
 *
 * Requirements enforced below:
 * - all pointers non-null, F16 support available on the device when F16 is used;
 * - input1 is F16 or F32 and both inputs share the same data type;
 * - the two input shapes are broadcast-compatible;
 * - if the output is already configured, it matches the input data type and
 *   the broadcast output shape.
 */
Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);

    // broadcast_shape() returns an empty shape (total_size == 0) when the two
    // shapes cannot be broadcast against each other.
    const TensorShape out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    // Validate in case of configured output
    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
                                        "Wrong shape for output");
    }

    return Status{};
}

/** Auto-initialise the output tensor info (shape/format) if empty, then build
 * the maximal execution window over the broadcast valid region and register
 * the horizontal access patterns of both inputs and the output.
 *
 * Returns RUNTIME_ERROR if the padding of any tensor had to be changed
 * ("Insufficient Padding"), plus the computed window.
 */
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    {
        set_shape_if_empty(*output, out_shape);

        // F16 output only when BOTH inputs are F16; any F32 input promotes the
        // output to F32. (validate_arguments already requires matching input
        // types, so in practice these two branches are mutually exclusive.)
        if(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
        {
            set_format_if_unknown(*output, Format::F16);
        }
        else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
        {
            set_format_if_unknown(*output, Format::F32);
        }
    }

    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
    // Broadcast inputs iterate with a collapsed (size-1) dimension where needed.
    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);

    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);

    // NOTE(review): '||' short-circuits, so once one tensor reports a window
    // change the remaining update_window_and_padding() calls are skipped.
    // This matches the pattern used by the other arithmetic CL kernels in the
    // library — presumably intentional, but worth confirming that the skipped
    // calls have no required side effects.
    bool window_changed = update_window_and_padding(win_input1, input1_access)
                          || update_window_and_padding(win_input2, input2_access)
                          || update_window_and_padding(win, output_access);

    output_access.set_valid_region(win, valid_region);

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
} // namespace
| 96 | |
| 97 | CLArithmeticDivisionKernel::CLArithmeticDivisionKernel() |
| 98 | : _input1(nullptr), _input2(nullptr), _output(nullptr) |
| 99 | { |
| 100 | } |
| 101 | |
| 102 | void CLArithmeticDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) |
| 103 | { |
| 104 | ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); |
| 105 | ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); |
| 106 | |
| 107 | // Configure kernel window |
| 108 | auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); |
| 109 | ARM_COMPUTE_ERROR_THROW_ON(win_config.first); |
| 110 | |
| 111 | _input1 = input1; |
| 112 | _input2 = input2; |
| 113 | _output = output; |
| 114 | |
| 115 | // Set kernel build options |
| 116 | std::set<std::string> build_opts; |
| 117 | build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); |
| 118 | build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); |
| 119 | build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); |
| 120 | |
| 121 | // Create kernel |
| 122 | _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_div", build_opts)); |
| 123 | |
| 124 | ICLKernel::configure(win_config.second); |
| 125 | } |
| 126 | |
| 127 | Status CLArithmeticDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) |
| 128 | { |
| 129 | ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output)); |
| 130 | ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first); |
| 131 | |
| 132 | return Status{}; |
| 133 | } |
| 134 | |
/** Enqueue the division kernel over @p window, slicing it into 3D chunks and
 * replaying broadcast inputs where their dimensions are 1.
 */
void CLArithmeticDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
    const TensorShape &out_shape = _output->info()->tensor_shape();

    // Dimensions Z and above may only be collapsed into a single 3D window
    // when the inputs agree on every one of them (i.e. no broadcasting there).
    // A scalar-like input (total_size <= 1) never blocks collapsing.
    bool can_collapse = true;
    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
    {
        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
        {
            can_collapse = (in_shape1[d] == in_shape2[d]);
        }
    }

    bool has_collapsed = false;
    Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;

    // If the window was collapsed, the input shapes must be collapsed the same
    // way so the broadcast check below sees consistent dimensions.
    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;

    Window slice        = collapsed.first_slice_window_3D();
    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);

    do
    {
        unsigned int idx = 0;

        add_3D_tensor_argument(idx, _input1, slice_input1);
        add_3D_tensor_argument(idx, _input2, slice_input2);
        add_3D_tensor_argument(idx, _output, slice);

        enqueue(queue, *this, slice);

        // Advance the input slices in lock-step with the output slice. Their
        // return values are deliberately ignored: the loop terminates on the
        // output window alone, and a broadcast input simply stops advancing.
        collapsed.slide_window_slice_3D(slice_input1);
        collapsed.slide_window_slice_3D(slice_input2);
    }
    while(collapsed.slide_window_slice_3D(slice));
}
| 179 | |
| 180 | BorderSize CLArithmeticDivisionKernel::border_size() const |
| 181 | { |
| 182 | const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); |
| 183 | const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); |
| 184 | return BorderSize(0, border, 0, 0); |
| 185 | } |