/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

#include <set>
#include <string>

using namespace arm_compute;

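// The kernels in this file implement softmax along the X dimension in three
// stages: a row-wise maximum (CLLogits1DMaxKernel), a shift/exponentiate/
// accumulate pass (CLLogits1DShiftExpSumKernel; CLLogits1DMaxShiftExpSumKernel
// fuses the maximum into this pass), and a final normalization by the
// accumulated sum (CLLogits1DNormKernel). A minimal sketch of how a caller
// might chain the kernels (hypothetical tensor setup, shown for illustration
// only; in practice the CLSoftmaxLayer function performs this orchestration):
//
//     CLTensor input, max, tmp, sum, output;
//     // ... initialize and allocate the tensors; max and sum have X dimension 1 ...
//     CLLogits1DMaxKernel         max_kernel;
//     CLLogits1DShiftExpSumKernel shift_exp_sum_kernel;
//     CLLogits1DNormKernel        norm_kernel;
//     max_kernel.configure(&input, &max);
//     shift_exp_sum_kernel.configure(&input, &max, &tmp, &sum, 1.0f /* beta */);
//     norm_kernel.configure(&tmp, &sum, &output, 1.0f /* beta */);
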
namespace
{
/** Calculates softmax parameters from the quantized input scale and the exponent scaling factor, and returns them as build options.
 *
 * Prepares these build options:
 * - INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of the beta multiplier.
 * - DIFF_MIN - threshold difference between the maximum value of the input data and the currently processed value;
 *   it defines whether the value will be taken into account or not.
 *
 * @param[in] input_scale Input scaling factor
 * @param[in] beta        Exponent scaling factor beta
 *
 * @return The set of build options for the quantized softmax kernels
 */
CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
{
    // Number of integer bits in temporary fixed-point representation of current-to-max difference
    static const int scaled_diff_int_bits = 5;
    // Number of integer bits used in temporary fixed-point representation of exponent accumulator
    static const int exp_accumulation_in_bits = 12;

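    // Represent beta * input_scale in 31-bit fixed point with scaled_diff_int_bits
    // integer bits, clamping at INT32_MAX so the result fits an int32 kernel
    // argument (this mirrors gemmlowp's reference quantized softmax)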
    const double beta_multiplier = std::min(
        1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
        (1ll << 31) - 1.0);
    int input_beta_multiplier, input_beta_left_shift;
    quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);

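    // Differences from the row maximum below diff_min (a negative threshold)
    // would overflow the fixed-point representation once rescaled, and their
    // exp() contribution to the sum is negligible, so the kernels skip them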
    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1ll << (31 - scaled_diff_int_bits)) / (1ll << input_beta_left_shift);
    const int    diff_min           = -1.f * std::floor(max_input_rescaled);

    CLBuildOptions build_opts;
    build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
    build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits));
    build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
    build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
    build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));

    return build_opts;
}
} // namespace

void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    // Softmax across the x dimension
    TensorShape output_shape{ input->info()->tensor_shape() };
    output_shape.set(0, 1);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output->info(),
                       output_shape,
                       1,
                       input->info()->data_type(),
                       input->info()->fixed_point_position(),
                       input->info()->quantization_info());

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(CLLogits1DMaxKernel::validate(input->info(), output->info()));

    _input  = input;
    _output = output;

    const DataType data_type = input->info()->data_type();
    // The kernel loops over all elements in steps of 16
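    // (the window step below equals the whole row rounded up to a multiple of 16,
    // so each work-item ends up reducing one entire row)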
    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);

    // Set build options
    CLBuildOptions build_opts;
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
    build_opts.add_option_if(is_data_type_fixed_point(data_type),
                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
    build_opts.add_option_if(data_type == DataType::F16, "-DUSE_F16");
    // Tell the kernel that the width is not a multiple of 16
    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, "-DNON_MULTIPLE_OF_16");

    // Create kernel
    std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "softmax_layer_max_quantized" : "softmax_layer_max";
    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Set fixed arguments
    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));

    // Configure kernel window
    constexpr unsigned int num_elems_written_per_iteration = 1;

    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win, input_access, output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));

    ICLKernel::configure(win);

    // Set config_id for enabling LWS tuning
    _config_id = "softmax_layer_";
    _config_id += lower_string(string_from_data_type(data_type));
    _config_id += "_";
    _config_id += support::cpp11::to_string(input->info()->dimension(0));
    _config_id += "_";
    _config_id += support::cpp11::to_string(input->info()->dimension(1));
}

Error CLLogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        // Softmax across the x dimension
        TensorShape output_shape{ input->tensor_shape() };
        output_shape.set(0, 1);

        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
    }

    return Error{};
}

CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
{
}

void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);

    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->info()->data_type();

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, tmp_data_type, input->info()->fixed_point_position());
    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, tmp_data_type, input->info()->fixed_point_position());

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(CLLogits1DShiftExpSumKernel::validate(input->info(), max->info(), output->info(), sum->info()));

    _input  = input;
    _max    = max;
    _output = output;
    _sum    = sum;

    const DataType dt = input->info()->data_type();
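    // For fixed-point inputs, beta is converted to the tensor's fixed-point
    // format so the kernel can apply it with integer arithmetic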
    auto beta_int = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));

    // The kernel loops over all elements in steps of 16
    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);

    // Set build options
    CLBuildOptions build_opts;
    build_opts.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
    build_opts.add_option_if(is_data_type_fixed_point(dt),
                             std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
    build_opts.add_option_if(dt == DataType::F16, std::string("-DUSE_F16"));
    // Tell the kernel that the width is not a multiple of 16
    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, std::string("-DNON_MULTIPLE_OF_16"));
    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), std::string("-DBETA=" + support::cpp11::to_string(beta_int)));
    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), std::string("-DBETA=" + float_to_string_with_full_precision(beta)));
    build_opts.add_options_if(is_quantized_asymmetric,
                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());

    // Create kernel
    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_shift_exp_sum_quantized" : "softmax_layer_shift_exp_sum";
    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Set fixed arguments
    unsigned int idx = 4 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));

    // Configure window
    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));

    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal max_access(max->info(), 0, 1);
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal sum_access(sum->info(), 0, 1);

    update_window_and_padding(win, input_access, max_access, output_access, sum_access);

    output_access.set_valid_region(win, input->info()->valid_region());
    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));

    ICLKernel::configure(win);
}

Error CLLogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);

    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        if(is_quantized_asymmetric)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        }
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
    }

    // Checks performed when sum is configured
    if(sum->total_size() != 0)
    {
        if(is_quantized_asymmetric)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
        }
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
    }

    return Error{};
}

void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
    Window slice            = window_collapsed.first_slice_window_3D();

    do
    {
        unsigned int idx = 0;
        // Set inputs
        add_3D_tensor_argument(idx, _input, slice);
        add_3D_tensor_argument(idx, _max, slice);
        add_3D_tensor_argument(idx, _output, slice);
        add_3D_tensor_argument(idx, _sum, slice);
        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window_collapsed.slide_window_slice_3D(slice));
}

/** Grid size (obtained through auto-tuning) */
const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
/** Vector size in the serial case (obtained through auto-tuning) */
const unsigned int CLLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
/** Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost). */
const unsigned int CLLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;

CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
{
}

void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(CLLogits1DMaxShiftExpSumKernel::validate(input->info(), max->info(), output->info(), sum->info()));

    _input  = input;
    _max    = max;
    _output = output;
    _sum    = sum;

    const DataType dt                 = input->info()->data_type();
    const size_t   reduction_dim_size = input->info()->dimension(0);
    auto           beta_int           = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));

    // Set build options
    CLBuildOptions build_opts;
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
    build_opts.add_option_if(is_data_type_fixed_point(dt),
                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
    build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));

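    // Pick the serial kernel (one work-item reduces a whole row) or the parallel
    // kernel (a work-group of _grid_size work-items cooperates on a single row),
    // depending on the reduction size; see is_parallel_reduction() below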
    _lws_hint = cl::NullRange;
    std::string           kernel_name             = std::string("softmax_layer_max_shift_exp_sum_serial");
    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
    unsigned int          vector_size             = std::get<1>(parallel_reduction_info);

    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
    build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
    build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");

    // Configure parallel kernel if needed
    if(std::get<0>(parallel_reduction_info))
    {
        kernel_name            = std::string("softmax_layer_max_shift_exp_sum_parallel");
        bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
        build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));

        // Handle boundary conditions.
        const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
        build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
        // Setting _lws_hint in this way can also communicate grid_size to CLLogits1DMaxShiftExpSumKernel::run().
        // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
        _lws_hint = cl::NDRange(_grid_size);
    }

    // Create kernel.
    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Set static arguments. Both kernels take the same arguments.
    unsigned int idx = 4 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
    _kernel.setArg<cl_uint>(idx++, reduction_dim_size);

    // Configure window
    const unsigned int num_elems_x = ceil_to_multiple(input->info()->tensor_shape().x(), vector_size);
    Window             win         = calculate_max_window(*input->info(), Steps(num_elems_x));

    AccessWindowHorizontal input_access(input->info(), 0, num_elems_x);
    AccessWindowHorizontal max_access(max->info(), 0, 1);
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_x);
    AccessWindowHorizontal sum_access(sum->info(), 0, 1);

    update_window_and_padding(win, input_access, max_access, output_access, sum_access);

    output_access.set_valid_region(win, input->info()->valid_region());
    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));

    ICLKernel::configure(win);
}

Error CLLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);

    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
    }

    // Checks performed when sum is configured
    if(sum->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
    }

    return Error{};
}

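// The parallel path is only taken when a row has at least
// _grid_size * _serial_vector_size elements (and the grid has more than one
// work-item); shorter rows use the serial kernel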
CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
{
    bool         is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
    unsigned int vector_size           = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
    return std::make_tuple(is_parallel_reduction, vector_size);
}

void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    // Collapse window in Z dimension
    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);

    // Reconfigure window in case of parallel reduction
    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0));
    if(std::get<0>(parallel_reduction_info))
    {
        // To launch grid_size parallel workitems, steps.x should be modified as follows.
        const unsigned int step = std::get<1>(parallel_reduction_info);
        window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size * step, step));
    }

    // Get slices
    Window slice = window_collapsed.first_slice_window_3D();
    do
    {
        unsigned int idx = 0;
        // Set inputs
        add_3D_tensor_argument(idx, _input, slice);
        add_3D_tensor_argument(idx, _max, slice);
        add_3D_tensor_argument(idx, _output, slice);
        add_3D_tensor_argument(idx, _sum, slice);
        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window_collapsed.slide_window_slice_3D(slice));
}

CLLogits1DNormKernel::CLLogits1DNormKernel()
    : _input(nullptr), _sum(nullptr), _output(nullptr)
{
}

void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);

    // Note: output should always have a scale of 1/256 and offset 0
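    // (softmax outputs lie in [0, 1], so a scale of 1/256 with zero offset maps
    // them onto the full QASYMM8 range)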
    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
    const bool             is_quantized_asymmetric   = (input->info()->data_type() == DataType::S32);
    const DataType         output_data_type          = is_quantized_asymmetric ? DataType::QASYMM8 : input->info()->data_type();

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output->info(),
                       input->info()->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(CLLogits1DNormKernel::validate(input->info(), sum->info(), output->info()));

    _input  = input;
    _sum    = sum;
    _output = output;

    // Set build options
    CLBuildOptions build_opts;
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
    build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()),
                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
    build_opts.add_options_if(is_quantized_asymmetric,
                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());

    // Create kernel
    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_norm_quantized" : "softmax_layer_norm";
    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

    // Configure window
    constexpr unsigned int num_elems_processed_per_iteration = 16;

    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));

    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
    AccessWindowStatic     sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);

    update_window_and_padding(win, input_access, sum_access, output_access);

    output_access.set_valid_region(win, input->info()->valid_region());

    ICLKernel::configure(win);
}

Error CLLogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum, output);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum);

    // Note: output should always have a scale of 1/256 and offset 0
    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
    const bool             is_quantized_asymmetric   = (input->data_type() == DataType::S32);

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
        if(!is_quantized_asymmetric)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
            ARM_COMPUTE_RETURN_ERROR_ON(output->quantization_info() != allowed_quantization_info);
        }
    }

    return Error{};
}

void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
    Window slice            = window_collapsed.first_slice_window_3D();

    do
    {
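        // The sum tensor holds a single value per row, so clamp its window to
        // one step along X while the input and output slices advance normally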
        Window sum_slice = slice;
        sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));

        unsigned int idx = 0;
        // Set inputs
        add_3D_tensor_argument(idx, _input, slice);
        add_3D_tensor_argument(idx, _sum, sum_slice);
        add_3D_tensor_argument(idx, _output, slice);
        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window_collapsed.slide_window_slice_3D(slice));
}