Blame - src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp - ml/ComputeLibrary

blob: 99004462186ae75213c84402adbc382e24dd811a [file] [log] [blame]

Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	1	/*
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	2	* Copyright (c) 2017-2019 ARM Limited.
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Giorgio Arena	04a8f8c	2017-11-23 11:45:24 +0000	[diff] [blame]	24	#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
				29	#include "arm_compute/core/NEON/NEMath.h"
				30	#include "arm_compute/core/TensorInfo.h"
				31	#include "arm_compute/core/Utils.h"
				32	#include "arm_compute/core/Validate.h"
				33	#include "arm_compute/core/Window.h"
				34
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	35	#include "arm_compute/core/NEON/wrapper/wrapper.h"
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	36	#include <arm_neon.h>
				37	#include <cmath>
				38
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	39	namespace arm_compute
				40	{
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	41	namespace
				42	{
/** Modulus handed to wrap_around() when mapping a (possibly negative) axis argument onto an actual axis. */
constexpr int max_input_tensor_dim = 3;
				44
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	45	template <typename T, int S>
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	46	void l2_normalize_X(const ITensor in, const ITensor sum, ITensor *out, float epsilon, const Window &window)
				47	{
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	48	/** NEON vector tag type. */
				49	using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
				50
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	51	Window window_sum(window);
				52	window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
				53
				54	Window in_slice = window.first_slice_window_1D();
				55	Window sum_slice = window_sum.first_slice_window_1D();
				56
				57	do
				58	{
				59	Iterator input_it(in, in_slice);
				60	Iterator sum_it(sum, sum_slice);
				61	Iterator output_it(out, in_slice);
				62
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	63	const auto sum_value = reinterpret_cast<const T >(sum_it.ptr());
				64	const auto vec_normalize_value = wrapper::vdup_n(static_cast<T>(1.f / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)))), ExactTagType{});
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	65
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	66	execute_window_loop(in_slice, [&](const Coordinates &)
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	67	{
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	68	const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
				69	const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	70
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	71	wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	72	},
				73	input_it, output_it);
				74	}
				75	while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
				76	}
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	77
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	78	template <typename T, int S>
				79	void l2_normalize_Y(const ITensor in, const ITensor sum, ITensor *out, float epsilon, const Window &window)
				80	{
				81	/** NEON vector tag type. */
				82	using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
				83
				84	Window window_sum(window);
				85	window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
				86
				87	Window in_slice = window.first_slice_window_2D();
				88	Window sum_slice = window_sum.first_slice_window_2D();
				89
				90	do
				91	{
				92	Iterator input_it(in, in_slice);
				93	Iterator sum_it(sum, sum_slice);
				94	Iterator output_it(out, in_slice);
				95
				96	auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
				97
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	98	execute_window_loop(in_slice, [&](const Coordinates &)
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	99	{
				100	const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
				101	const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
				102	const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
				103
				104	const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
				105	wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
				106	},
				107	input_it, sum_it, output_it);
				108	}
				109	while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
				110	}
				111
				112	template <typename T, int S>
				113	void l2_normalize_Z(const ITensor in, const ITensor sum, ITensor *out, float epsilon, const Window &window)
				114	{
				115	/** NEON vector tag type. */
				116	using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
				117
				118	Window window_sum(window);
				119	window_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
				120
				121	Window in_slice = window.first_slice_window_3D();
				122	Window sum_slice = window_sum.first_slice_window_3D();
				123
				124	do
				125	{
				126	Iterator input_it(in, in_slice);
				127	Iterator sum_it(sum, sum_slice);
				128	Iterator output_it(out, in_slice);
				129
				130	auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
				131
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	132	execute_window_loop(in_slice, [&](const Coordinates &)
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	133	{
				134	const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
				135	const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
				136	const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
				137
				138	const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
				139	wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
				140	},
				141	input_it, sum_it, output_it);
				142	}
				143	while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
				144	}
				145
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	146	Status validate_arguments(const ITensorInfo input, const ITensorInfo sum, const ITensorInfo *output, int axis, float epsilon)
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	147	{
				148	ARM_COMPUTE_UNUSED(epsilon);
				149
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	150	const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	151	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
				152	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	153	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	154	ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
				155	ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	156
				157	// Reduce shape on axis
				158	TensorShape sum_shape = input->tensor_shape();
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	159	sum_shape.set(actual_axis, 1);
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	160	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
				161
				162	if(output->total_size() != 0)
				163	{
				164	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
				165	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
				166	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	167	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	168	}
				169
				170	return Status{};
				171	}
				172
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	173	std::tuple<Status, Window> validate_and_configure_window(ITensorInfo input, ITensorInfo sum, ITensorInfo *output, int axis)
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	174	{
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	175	const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	176	const unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	177	const unsigned int num_elems_processed_per_iteration_sum = (actual_axis == 0) ? 1 : num_elems_processed_per_iteration;
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	178
				179	Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
				180
				181	// Output auto initialization if not yet initialized
Vidhya Sudhan Loganathan	7485d5a	2018-07-04 09:34:00 +0100	[diff] [blame]	182	auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	183
				184	AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
				185	AccessWindowHorizontal sum_access(sum, 0, num_elems_processed_per_iteration_sum);
				186	AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
				187
				188	bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
				189	output_access.set_valid_region(win, input->valid_region());
				190
				191	Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
				192
				193	return std::make_tuple(err, win);
				194	}
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	195	} // namespace
				196
Giorgio Arena	04a8f8c	2017-11-23 11:45:24 +0000	[diff] [blame]	197	NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	198	: _input(nullptr), _sum(nullptr), _output(nullptr), _actual_axis(0), _epsilon(1e-12)
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	199	{
				200	}
				201
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	202	void NEL2NormalizeLayerKernel::configure(const ITensor input, const ITensor sum, ITensor *output, int axis, float epsilon)
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	203	{
				204	ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	205	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	206
				207	_input = input;
				208	_sum = sum;
				209	_output = output;
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	210	_actual_axis = wrap_around(axis, max_input_tensor_dim);
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	211	_epsilon = epsilon;
				212
				213	// Configure kernel window
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	214	auto win_config = validate_and_configure_window(_input->info(), _sum->info(), _output->info(), axis);
				215	ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	216
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	217	INEKernel::configure(std::get<1>(win_config));
				218	}
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	219
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	220	Status NEL2NormalizeLayerKernel::validate(const ITensorInfo input, const ITensorInfo sum, const ITensorInfo *output, int axis, float epsilon)
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	221	{
				222	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
				223	ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), sum->clone().get(), output->clone().get(), axis)));
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	224
John Richardson	73d4aef	2018-05-08 14:34:33 +0100	[diff] [blame]	225	return Status{};
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	226	}
				227
Giorgio Arena	04a8f8c	2017-11-23 11:45:24 +0000	[diff] [blame]	228	void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	229	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	230	ARM_COMPUTE_UNUSED(info);
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	231	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				232	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				233
Manuel Bottini	4b5c588	2019-05-14 10:38:30 +0100	[diff] [blame]	234	switch(_actual_axis)
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	235	{
				236	case 0:
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	237	switch(_input->info()->data_type())
				238	{
				239	case DataType::F32:
				240	l2_normalize_X<float, 4>(_input, _sum, _output, _epsilon, window);
				241	break;
				242	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				243	case DataType::F16:
				244	l2_normalize_X<float16_t, 8>(_input, _sum, _output, _epsilon, window);
				245	break;
				246	#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				247	default:
				248	ARM_COMPUTE_ERROR("Not implemented");
				249	}
				250	break;
				251	case 1:
				252	switch(_input->info()->data_type())
				253	{
				254	case DataType::F32:
				255	l2_normalize_Y<float, 4>(_input, _sum, _output, _epsilon, window);
				256	break;
				257	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				258	case DataType::F16:
				259	l2_normalize_Y<float16_t, 8>(_input, _sum, _output, _epsilon, window);
				260	#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				261	break;
				262	default:
				263	ARM_COMPUTE_ERROR("Not implemented");
				264	}
				265	break;
				266	case 2:
				267	switch(_input->info()->data_type())
				268	{
				269	case DataType::F32:
				270	l2_normalize_Z<float, 4>(_input, _sum, _output, _epsilon, window);
				271	break;
				272	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				273	case DataType::F16:
				274	l2_normalize_Z<float16_t, 8>(_input, _sum, _output, _epsilon, window);
				275	break;
				276	#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				277	default:
				278	ARM_COMPUTE_ERROR("Not implemented");
				279	}
Georgios Pinitas	d976958	2017-08-03 10:19:40 +0100	[diff] [blame]	280	break;
				281	default:
				282	ARM_COMPUTE_ERROR("Unsupported normalization axis");
				283	}
				284	}
Michalis Spyrou	2897e61	2018-11-20 18:38:29 +0000	[diff] [blame]	285	} // namespace arm_compute