Blame - src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp - ml/ComputeLibrary

blob: 2e78107a1a422abfb041d68d4bd9a282829e31ee [file] [log] [blame]

Luca Foschiani	4b86953	2020-02-13 15:07:36 +0000	[diff] [blame]	1	/*
Michele Di Giorgio	d9eaf61	2020-07-08 11:12:57 +0100	[diff] [blame]	2	* Copyright (c) 2020 Arm Limited.
Luca Foschiani	4b86953	2020-02-13 15:07:36 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Michalis Spyrou	ebcebf1	2020-10-21 00:04:14 +0100	[diff] [blame^]	24	#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
Luca Foschiani	4b86953	2020-02-13 15:07:36 +0000	[diff] [blame]	25
Luca Foschiani	4b86953	2020-02-13 15:07:36 +0000	[diff] [blame]	26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
Luca Foschiani	4b86953	2020-02-13 15:07:36 +0000	[diff] [blame]	29	#include "arm_compute/core/Types.h"
				30	#include "arm_compute/core/Utils.h"
				31	#include "arm_compute/core/Validate.h"
				32	#include "arm_compute/core/Window.h"
				33	#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	34	#include "src/core/AccessWindowStatic.h"
Georgios Pinitas	ddb93bb	2020-10-02 16:38:59 +0100	[diff] [blame]	35	#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	36	#include "src/core/helpers/AutoConfiguration.h"
				37	#include "src/core/helpers/WindowHelpers.h"
Luca Foschiani	4b86953	2020-02-13 15:07:36 +0000	[diff] [blame]	38
				39	#include <arm_neon.h>
				40	#include <cstddef>
				41	#include <cstdint>
				42
				43	namespace arm_compute
				44	{
				45	Status validate_arguments(const ITensorInfo input, const ITensorInfo bias, const ITensorInfo output, const GEMMLowpOutputStageInfo output_stage)
				46	{
				47	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
				48
				49	ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
				50	ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
				51	\|\| output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
				52
				53	// Check biases if exist
				54	if(bias != nullptr)
				55	{
				56	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
				57	ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
				58	ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
				59	}
				60
				61	if(output->total_size() != 0)
				62	{
				63	if(output->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 \|\| output_stage->output_data_type == DataType::QASYMM8_SIGNED))
				64	{
				65	ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types");
				66	}
				67
				68	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
				69	}
				70
				71	return Status{};
				72	}
				73
				74	inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
				75	{
				76	// Add the offset terms to GEMM's result
				77	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
				78	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
				79	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
				80	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
				81
				82	// Multiply by result_mult_int
				83	in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
				84	in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
				85	in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
				86	in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
				87	}
				88
				89	template <typename T>
				90	inline typename std::enable_if<std::is_same<T, uint8_t>::value,
				91	typename wrapper::traits::neon_vector<T, 16>::type>::type
				92	convert_to_8bit(const int16x8x2_t in_s16)
				93	{
				94	return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1]));
				95	}
				96
				97	template <typename T>
				98	inline typename std::enable_if<std::is_same<T, int8_t>::value,
				99	typename wrapper::traits::neon_vector<T, 16>::type>::type
				100	convert_to_8bit(const int16x8x2_t in_s16)
				101	{
				102	return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1]));
				103	}
				104
				105	template <typename T>
				106	inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector<T, 16>::type min,
				107	typename wrapper::traits::neon_vector<T, 16>::type max)
				108	{
				109	// Shift final result (negative value shift right)
				110	in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
				111	in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
				112	in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
				113	in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
				114
				115	// Convert S32 to S16
				116	const int16x8x2_t in_s16 =
				117	{
				118	{
				119	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				120	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				121	}
				122	};
				123
				124	// Convert S16 to S8 or U8
				125	typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16);
				126
				127	out = wrapper::vmax(out, min);
				128	out = wrapper::vmin(out, max);
				129
				130	return out;
				131	}
				132
				133	class Coordinates;
				134
				135	template <typename T>
				136	void NEGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window)
				137	{
				138	using VectorType = typename wrapper::traits::neon_vector<T, 16>::type;
				139
				140	const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->gemmlowp_offset);
				141	const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->gemmlowp_shift);
				142	const int window_step_x = 16;
				143	const auto window_start_x = static_cast<int>(window.x().start());
				144	const auto window_end_x = static_cast<int>(window.x().end());
				145
				146	const int clamp_min = (_is_bounded_relu) ? _output_stage->gemmlowp_min_bound : std::numeric_limits<T>::lowest();
				147	const int clamp_max = (_is_bounded_relu) ? _output_stage->gemmlowp_max_bound : std::numeric_limits<T>::max();
				148
				149	VectorType min = wrapper::vdup_n(static_cast<T>(clamp_min), wrapper::traits::vector_128_tag{});
				150	VectorType max = wrapper::vdup_n(static_cast<T>(clamp_max), wrapper::traits::vector_128_tag{});
				151
				152	Window win(window);
				153	win.set(Window::DimX, Window::Dimension(0, 1, 1));
				154
				155	Iterator in(_input, win);
				156	Iterator out(_output, win);
				157
				158	if(_bias != nullptr)
				159	{
				160	Window win_biases;
				161	win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
				162	win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
				163
				164	Iterator bias(_bias, win_biases);
				165	execute_window_loop(win, [&](const Coordinates &)
				166	{
				167	// Compute 16 elements per iteration
				168	int x = window_start_x;
				169	for(; x <= (window_end_x - window_step_x); x += window_step_x)
				170	{
				171	int32x4x4_t in_s32 =
				172	{
				173	{
				174	vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
				175	vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
				176	vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
				177	vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
				178	}
				179	};
				180
				181	const int32x4x4_t bias_s32 =
				182	{
				183	{
				184	vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),
				185	vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),
				186	vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),
				187	vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)
				188	}
				189	};
				190
				191	// Add the bias to GEMM's result
				192	in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
				193	in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
				194	in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
				195	in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
				196
				197	// Add the offset terms to GEMM's result and multiply by result_mult_int
				198	scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
				199
				200	wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
				201	}
				202
				203	// Compute left-over elements
				204	for(; x < window_end_x; ++x)
				205	{
				206	const int bias_value = (reinterpret_cast<const int >(bias.ptr()) + x);
				207	int in_value = (reinterpret_cast<const int >(in.ptr()) + x);
				208
				209	// Quantize
				210	in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
				211
				212	// Store the result
				213	*(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
				214	}
				215	},
				216	in, bias, out);
				217	}
				218	else
				219	{
				220	execute_window_loop(win, [&](const Coordinates &)
				221	{
				222	// Compute 16 elements per iteration
				223	int x = window_start_x;
				224	for(; x <= (window_end_x - window_step_x); x += window_step_x)
				225	{
				226	int32x4x4_t in_s32 =
				227	{
				228	{
				229	vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
				230	vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
				231	vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
				232	vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
				233	}
				234	};
				235
				236	// Add the offset terms to GEMM's result and multiply by result_mult_int
				237	scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
				238
				239	wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
				240	}
				241
				242	// Compute left-over elements
				243	for(; x < window_end_x; ++x)
				244	{
				245	int in_value = (reinterpret_cast<const int >(in.ptr()) + x);
				246
				247	// Quantize
				248	in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
				249
				250	// Store the result
				251	*(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
				252	}
				253	},
				254	in, out);
				255	}
				256	}
				257
				258	NEGEMMLowpQuantizeDownInt32ScaleKernel::NEGEMMLowpQuantizeDownInt32ScaleKernel()
				259	: _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _output_stage(nullptr), _is_bounded_relu(false)
				260	{
				261	}
				262
				263	void NEGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ITensor input, const ITensor bias, ITensor output, const GEMMLowpOutputStageInfo output_stage)
				264	{
				265	// Perform validate step
				266	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, output_stage);
				267
				268	// Output auto inizialitation if not yet initialized
				269	auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type));
				270
				271	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
				272	(bias != nullptr) ? bias->info() : nullptr,
				273	output->info(),
				274	output_stage));
				275
				276	_input = input;
				277	_bias = bias;
				278	_output = output;
				279	_output_stage = output_stage;
				280
				281	// Configure kernel window
				282	Window win = calculate_max_window(*input->info(), Steps());
				283	Coordinates coord;
				284	coord.set_num_dimensions(output->info()->num_dimensions());
				285	output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
				286
				287	INEKernel::configure(win);
				288
				289	// Check if we need to clamp the result using min and max
				290	_is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound)
				291	&& !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
				292	&& _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
				293	if(_output_stage->output_data_type == DataType::QASYMM8)
				294	{
				295	_func = &NEGEMMLowpQuantizeDownInt32ScaleKernel::run<uint8_t>;
				296	}
				297	else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
				298	{
				299	_func = &NEGEMMLowpQuantizeDownInt32ScaleKernel::run<int8_t>;
				300	}
				301	else
				302	{
				303	ARM_COMPUTE_ERROR("Data type not supported");
				304	}
				305	}
				306
				307	Status NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo input, const ITensorInfo bias, const ITensorInfo output, const GEMMLowpOutputStageInfo output_stage)
				308	{
				309	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
				310	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage));
				311
				312	return Status{};
				313	}
				314
				315	void NEGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, const ThreadInfo &info)
				316	{
				317	ARM_COMPUTE_UNUSED(info);
				318	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				319	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				320
				321	(this->*_func)(window);
				322	}
				323	} // namespace arm_compute