/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

#include <set>
#include <string>

using namespace arm_compute;

namespace
{
/** Calculates softmax parameters from the quantized input scale and the exponent scaling factor, and exposes them as build options.
 *
 * Prepares these build options:
 * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of the beta multiplier.
 * -DIFF_MIN - threshold difference between the maximum value of the input data and the currently processed value;
 *  it defines whether the value will be taken into account or not.
 *
 * @param[in] input_scale Input scaling factor
 * @param[in] beta        Exponent scaling factor beta
 *
 * @return The build options
 */
CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
{
    // Number of integer bits in the temporary fixed-point representation of the current-to-max difference
    static const int scaled_diff_int_bits = 5;
    // Number of integer bits used in the temporary fixed-point representation of the exponent accumulator
    static const int exp_accumulation_in_bits = 12;

    const double beta_multiplier = std::min(
        1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
        (1ll << 31) - 1.0);
    int input_beta_multiplier, input_beta_left_shift;
    quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);

    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1ll << (31 - scaled_diff_int_bits)) / (1ll << input_beta_left_shift);
    // Values whose difference from the row maximum falls below diff_min contribute negligibly and are skipped by the kernel.
    // Note: the conversion is done in double precision; a float intermediate would lose precision for large rescaled values.
    const int diff_min = static_cast<int>(-std::floor(max_input_rescaled));

    CLBuildOptions build_opts;
    build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
    build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits));
    build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
    build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
    build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));

    return build_opts;
}
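
// Illustrative sketch (not library code) of the decomposition performed above, assuming
// calculate_quantized_multiplier_greater_than_one() follows the usual gemmlowp convention:
// a real multiplier m >= 1 is split into a Q0.31 integer multiplier q and a left shift s
// such that m ~= (q / 2^31) * 2^s.
//
//     void decompose_multiplier_example(double m, int32_t *q, int *s) // hypothetical helper
//     {
//         const double f = std::frexp(m, s);                           // m = f * 2^s, f in [0.5, 1)
//         auto q64       = static_cast<int64_t>(std::round(f * (1ll << 31)));
//         if(q64 == (1ll << 31))                                       // f rounded up to 1.0
//         {
//             q64 /= 2;
//             ++(*s);
//         }
//         *q = static_cast<int32_t>(q64);                              // Q0.31 multiplier
//     }
//
// For example, m = 1.5 gives f = 0.75, s = 1 and q = round(0.75 * 2^31).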

// Argument validation

Status validate_arguments_1DMax(const ITensorInfo *input, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        // Softmax across the x dimension
        TensorShape output_shape{ input->tensor_shape() };
        output_shape.set(0, 1);

        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
    }

    return Status{};
}
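
// For example, a (W, H, N) input reduced along the x dimension yields a (1, H, N) max tensor,
// which is exactly the output shape enforced above.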

Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);

    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        if(is_quantized_asymmetric)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        }
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
    }

    // Checks performed when sum is configured
    if(sum->total_size() != 0)
    {
        if(is_quantized_asymmetric)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
        }
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
    }

    return Status{};
}

Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);

    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
    }

    // Checks performed when sum is configured
    if(sum->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
    }

    return Status{};
}

Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum, output);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum);

    // Note: output should always have a scale of 1/256 and offset 0
    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
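    // Softmax outputs lie in [0, 1]; with a scale of 1/256 and zero offset the 8-bit value is
    // q = round(p * 256), e.g. p = 0.5 -> q = 128, while p = 1.0 saturates to 255.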
    const bool             is_quantized_asymmetric   = (input->data_type() == DataType::S32);

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
        if(!is_quantized_asymmetric)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
            ARM_COMPUTE_RETURN_ERROR_ON(output->quantization_info() != allowed_quantization_info);
        }
    }

    return Status{};
}

// Window validation

std::pair<Status, Window> validate_and_configure_window_1DMax(ITensorInfo *input, ITensorInfo *output)
{
    TensorShape output_shape{ input->tensor_shape() };
    output_shape.set(0, 1);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));

    // The OpenCL kernel loops over the row in steps of 16, so the window step is the row width rounded up to a multiple of 16
    const unsigned int     num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
    constexpr unsigned int num_elems_written_per_iteration   = 1;

    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);

    bool window_changed = update_window_and_padding(win, input_access, output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
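
// Illustrative example: for a row of 20 elements the step above becomes ceil_to_multiple(20, 16) = 32,
// so the input access window extends 12 elements past the row. update_window_and_padding() tries to
// claim that as tensor padding; if the window has to change instead, "Insufficient Padding!" is reported.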

std::pair<Status, Window> validate_and_configure_window_1DShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
{
    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->data_type();

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*sum, max->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->fixed_point_position()));
    auto_init_if_empty(*output, input->clone()->set_data_type(tmp_data_type));

    // The OpenCL kernel loops over the row in steps of 16, so the window step is the row width rounded up to a multiple of 16
    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);

    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));

    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal max_access(max, 0, 1);
    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal sum_access(sum, 0, 1);

    bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);

    output_access.set_valid_region(win, input->valid_region());
    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}

std::pair<Status, Window> validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
{
    // Output auto initialization if not yet initialized
    auto_init_if_empty(*sum, input->clone()->set_tensor_shape(max->tensor_shape()));
    auto_init_if_empty(*output, *input->clone());

    CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo parallel_reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(input->dimension(0));
    unsigned int       vector_size = std::get<1>(parallel_reduction_info);
    const unsigned int num_elems_x = ceil_to_multiple(input->tensor_shape().x(), vector_size);
    Window             win         = calculate_max_window(*input, Steps(num_elems_x));

    AccessWindowHorizontal input_access(input, 0, num_elems_x);
    AccessWindowHorizontal max_access(max, 0, 1);
    AccessWindowHorizontal output_access(output, 0, num_elems_x);
    AccessWindowHorizontal sum_access(sum, 0, 1);

    bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);

    output_access.set_valid_region(win, input->valid_region());
    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}

std::pair<Status, Window> validate_and_configure_window_1DNorm(ITensorInfo *input, ITensorInfo *output, ITensorInfo *sum)
{
    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
    const bool             is_quantized_asymmetric   = (input->data_type() == DataType::S32);
    const DataType         output_data_type          = is_quantized_asymmetric ? DataType::QASYMM8 : input->data_type();

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output,
                       input->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));

    constexpr unsigned int num_elems_processed_per_iteration = 16;

    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));

    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
    AccessWindowStatic     sum_access(sum, 0, 0, 1, sum->dimension(1));
    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);

    bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access);

    output_access.set_valid_region(win, input->valid_region());

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}

} // namespace
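
// The kernels below implement softmax along the x dimension as a numerically stable pipeline:
//
//     m      = max_x(input)                  (CLLogits1DMaxKernel)
//     e(x)   = exp(beta * (input(x) - m))    (CLLogits1DShiftExpSumKernel)
//     s      = sum_x(e(x))
//     out(x) = e(x) / s                      (CLLogits1DNormKernel)
//
// Subtracting the row maximum before exponentiation bounds exp() by 1 and avoids overflow;
// CLLogits1DMaxShiftExpSumKernel fuses the max and shift-exp-sum stages into a single kernel.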

void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    TensorShape output_shape{ input->info()->tensor_shape() };
    output_shape.set(0, 1);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMax(input->info(), output->info()));

    _input  = input;
    _output = output;

    const DataType data_type = input->info()->data_type();

    // Set build options
    CLBuildOptions build_opts;
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
    build_opts.add_option_if(is_data_type_fixed_point(data_type),
                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
    build_opts.add_option_if(data_type == DataType::F16, "-DUSE_F16");
    // Tell the kernel that the width is not a multiple of 16
    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, "-DNON_MULTIPLE_OF_16");

    // Create kernel
    std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "softmax_layer_max_quantized" : "softmax_layer_max";
    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Set fixed arguments
    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));

    // Configure kernel window
    auto win_config = validate_and_configure_window_1DMax(input->info(), output->info());
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure(win_config.second);

    // Set config_id for enabling LWS tuning
    _config_id = "softmax_layer_";
    _config_id += lower_string(string_from_data_type(data_type));
    _config_id += "_";
    _config_id += support::cpp11::to_string(input->info()->dimension(0));
    _config_id += "_";
    _config_id += support::cpp11::to_string(input->info()->dimension(1));
}

Status CLLogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMax(input, output));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMax(input->clone().get(), output->clone().get()).first);

    return Status{};
}
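
// Minimal usage sketch (illustrative only; tensor initialisation elided, and the
// Status::error_code() accessor is assumed). validate() works on ITensorInfo alone,
// before any memory is allocated, while configure() binds real tensors:
//
//     CLTensor in, out; // hypothetical tensors with their TensorInfo set up elsewhere
//     if(CLLogits1DMaxKernel::validate(in.info(), out.info()).error_code() == ErrorCode::OK)
//     {
//         CLLogits1DMaxKernel max_kernel;
//         max_kernel.configure(&in, &out); // auto-initialises 'out' if its info is empty
//     }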

CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
{
}

void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);

    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->info()->data_type();

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*sum->info(), max->info()->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->info()->fixed_point_position()));
    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(tmp_data_type));

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info()));

    _input  = input;
    _max    = max;
    _output = output;
    _sum    = sum;

    const DataType dt       = input->info()->data_type();
    auto           beta_int = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));

    // Set build options
    CLBuildOptions build_opts;
    build_opts.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
    build_opts.add_option_if(is_data_type_fixed_point(dt),
                             std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
    build_opts.add_option_if(dt == DataType::F16, std::string("-DUSE_F16"));
    // Tell the kernel that the width is not a multiple of 16
    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, std::string("-DNON_MULTIPLE_OF_16"));
    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), std::string("-DBETA=" + support::cpp11::to_string(beta_int)));
    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), std::string("-DBETA=" + float_to_string_with_full_precision(beta)));
    build_opts.add_options_if(is_quantized_asymmetric,
                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());

    // Create kernel
    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_shift_exp_sum_quantized" : "softmax_layer_shift_exp_sum";
    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Set fixed arguments
    unsigned int idx = 4 * num_arguments_per_3D_tensor(); // Skip the input, max, output and sum tensor arguments
    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));

    // Configure window
    auto win_config = validate_and_configure_window_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info());
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure(win_config.second);
}
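
// Worked example for the fixed-point BETA above: with fixed_point_position = 4 the scale is
// 2^4 = 16, so beta = 0.5 is encoded as beta_int = lround(0.5 * 16) = 8.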

Status CLLogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DShiftExpSum(input, max, output, sum));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);

    return Status{};
}

void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
    Window slice            = window_collapsed.first_slice_window_3D();

    do
    {
        unsigned int idx = 0;
        // Set inputs
        add_3D_tensor_argument(idx, _input, slice);
        add_3D_tensor_argument(idx, _max, slice);
        add_3D_tensor_argument(idx, _output, slice);
        add_3D_tensor_argument(idx, _sum, slice);
        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window_collapsed.slide_window_slice_3D(slice));
}

/** Grid size (obtained through auto-tuning) */
const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
/** Vector size in the serial case (obtained through auto-tuning) */
const unsigned int CLLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
/** Vector size in the parallel case (obtained through auto-tuning; enables the best memory access pattern for Bifrost) */
const unsigned int CLLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;

CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
{
}

void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*sum->info(), input->info()->clone()->set_tensor_shape(max->info()->tensor_shape()));
    auto_init_if_empty(*output->info(), *input->info()->clone());

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info()));

    _input  = input;
    _max    = max;
    _output = output;
    _sum    = sum;

    const DataType dt                 = input->info()->data_type();
    const size_t   reduction_dim_size = input->info()->dimension(0);
    auto           beta_int           = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));

    // Set build options
    CLBuildOptions build_opts;
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
    build_opts.add_option_if(is_data_type_fixed_point(dt),
                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
    build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));

    _lws_hint                                     = cl::NullRange;
    std::string           kernel_name             = std::string("softmax_layer_max_shift_exp_sum_serial");
    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
    unsigned int          vector_size             = std::get<1>(parallel_reduction_info);

    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
    build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
    build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");

    // Configure parallel kernel if needed
    if(std::get<0>(parallel_reduction_info))
    {
        kernel_name            = std::string("softmax_layer_max_shift_exp_sum_parallel");
        bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
        build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));

        // Handle boundary conditions.
        const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
        build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
        // Setting _lws_hint in this way also communicates grid_size to CLLogits1DMaxShiftExpSumKernel::run().
        // A single workgroup performs the reduction in dimension 0 in the parallel case, hence lws[0] == gws[0].
        _lws_hint = cl::NDRange(_grid_size);
    }

    // Create kernel.
    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Set static arguments. Both kernels use the same arguments.
    unsigned int idx = 4 * num_arguments_per_3D_tensor(); // Skip the input, max, output and sum tensor arguments
    _kernel.setArg<cl_uint>(idx++, reduction_dim_size);

    // Configure window
    auto win_config = validate_and_configure_window_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info());
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure(win_config.second);
}

Status CLLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(input, max, output, sum));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMaxShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);

    return Status{};
}

CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
{
    bool         is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
    unsigned int vector_size           = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
    return std::make_tuple(is_parallel_reduction, vector_size);
}
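
// Worked example with the tuned constants above (_grid_size = 64, _serial_vector_size = 8):
// rows of 512 (= 64 * 8) elements or more use the parallel kernel with a vector size of 4,
// while e.g. a 256-element row stays on the serial kernel with a vector size of 8.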

void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    // Collapse window in Z dimension
    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);

    // Reconfigure window in case of parallel reduction
    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0));
    if(std::get<0>(parallel_reduction_info))
    {
        // To launch grid_size parallel work-items, steps.x is modified as follows.
        const unsigned int step = std::get<1>(parallel_reduction_info);
        window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size * step, step));
    }
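
    // For example, with _grid_size = 64 and step = 4, DimX becomes [0, 256) with step 4,
    // i.e. 64 work-items cooperate on each row, matching the lws hint of _grid_size set in configure().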

    // Get slices
    Window slice = window_collapsed.first_slice_window_3D();
    do
    {
        unsigned int idx = 0;
        // Set inputs
        add_3D_tensor_argument(idx, _input, slice);
        add_3D_tensor_argument(idx, _max, slice);
        add_3D_tensor_argument(idx, _output, slice);
        add_3D_tensor_argument(idx, _sum, slice);
        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window_collapsed.slide_window_slice_3D(slice));
}

CLLogits1DNormKernel::CLLogits1DNormKernel()
    : _input(nullptr), _sum(nullptr), _output(nullptr)
{
}

void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);

    // Note: output should always have a scale of 1/256 and offset 0
    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
    const bool             is_quantized_asymmetric   = (input->info()->data_type() == DataType::S32);
    const DataType         output_data_type          = is_quantized_asymmetric ? DataType::QASYMM8 : input->info()->data_type();

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output->info(),
                       input->info()->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(input->info(), sum->info(), output->info()));

    _input  = input;
    _sum    = sum;
    _output = output;

    // Set build options
    CLBuildOptions build_opts;
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
    build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()),
                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
    build_opts.add_options_if(is_quantized_asymmetric,
                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());

    // Create kernel
    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_norm_quantized" : "softmax_layer_norm";
    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Configure window
    auto win_config = validate_and_configure_window_1DNorm(input->info(), output->info(), sum->info());
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure(win_config.second);
}
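
// For the quantized path the input here is the S32 output of the shift-exp-sum stage; the norm
// kernel is expected to divide by the row sum so that the result lands on the fixed (1/256, 0)
// QASYMM8 grid enforced by validate_arguments_1DNorm().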

Status CLLogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(input, sum, output));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DNorm(input->clone().get(), output->clone().get(), sum->clone().get()).first);

    return Status{};
}

void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
    Window slice            = window_collapsed.first_slice_window_3D();

    do
    {
        Window sum_slice = slice;
        sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));

        unsigned int idx = 0;
        // Set inputs
        add_3D_tensor_argument(idx, _input, slice);
        add_3D_tensor_argument(idx, _sum, sum_slice);
        add_3D_tensor_argument(idx, _output, slice);
        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window_collapsed.slide_window_slice_3D(slice));
}