Blame - src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp - ml/ComputeLibrary

blob: 57380b4546563e39981da141604bd09b08f9526b [file] [log] [blame]

Michele Di Giorgio	9175392	2019-06-13 10:56:59 +0100	[diff] [blame^]	1	/*
				2	* Copyright (c) 2019 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
				25
				26	#include "arm_compute/core/CPP/Validate.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
				29	#include "arm_compute/core/NEON/NEMath.h"
				30	#include "arm_compute/core/NEON/wrapper/wrapper.h"
				31	#include "arm_compute/core/TensorInfo.h"
				32	#include "arm_compute/core/Types.h"
				33	#include "arm_compute/core/Window.h"
				34
				35	namespace arm_compute
				36	{
				37	namespace
				38	{
				39	Status validate_arguments(const ITensorInfo input, const ITensorInfo output, float epsilon)
				40	{
				41	ARM_COMPUTE_UNUSED(epsilon);
				42	ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
				43	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
				44	ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Input tensor cannot have more than 2 dimensions");
				45	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
				46
				47	// Checks performed when output is configured
				48	if((output != nullptr) && (output->total_size() != 0))
				49	{
				50	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
				51	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
				52	}
				53	return Status{};
				54	}
				55
				56	std::pair<Status, Window> validate_and_configure_window(ITensorInfo input, ITensorInfo output)
				57	{
				58	if(output != nullptr)
				59	{
				60	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
				61	// Output auto inizialitation if not yet initialized
				62	auto_init_if_empty(output, input);
				63	}
				64
				65	// This kernel doesn't need padding. A left-over for loop on dimension X, we cannot have any read or write out of memory
				66	// For this reason num_elems_processed_per_iteration is set to 1
				67	Window win = calculate_max_window(*input, Steps());
				68	if(output != nullptr)
				69	{
				70	output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
				71	}
				72
				73	return std::make_pair(Status{}, win);
				74	}
				75	} // namespace
				76
				77	template <typename ScalarType, int size>
				78	void NEMeanStdDevNormalizationKernel::mean_stddev_normalization(const Window &window)
				79	{
				80	using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
				81
				82	// Set build options
				83	Window win = window;
				84	win.set(Window::DimX, Window::Dimension(0, 1, 1));
				85
				86	const int window_step_x = size;
				87	const auto window_start_x = static_cast<int>(window.x().start());
				88	const auto window_end_x = static_cast<int>(window.x().end());
				89
				90	Iterator input(_input, win);
				91	Iterator output(_output, win);
				92
				93	execute_window_loop(win, [&](const Coordinates &)
				94	{
				95	int x = window_start_x;
				96	auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
				97	auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
				98
				99	auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
				100	auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
				101
				102	for(; x <= (window_end_x - window_step_x); x += window_step_x)
				103	{
				104	auto data = wrapper::vloadq(in_ptr + x);
				105	sum_vec = wrapper::vadd(sum_vec, data);
				106	sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
				107	}
				108
				109	auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
				110	auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
				111	for(int i = 0; i < size / 4; ++i)
				112	{
				113	sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
				114	sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
				115	}
				116
				117	auto sum = wrapper::vgetlane(sum_carry_res, 0);
				118	auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
				119
				120	// Compute left-over elements
				121	for(; x < window_end_x; ++x)
				122	{
				123	ScalarType data = *(in_ptr + x);
				124	sum += data;
				125	sum_sq += data * data;
				126	}
				127
				128	ScalarType mean = sum / _input->info()->dimension(0);
				129	ScalarType var = (sum_sq / _input->info()->dimension(0)) - (mean * mean);
				130	ScalarType stddev_inv = 1.f / sqrt(var + _epsilon);
				131
				132	auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
				133	auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
				134	for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
				135	{
				136	auto data = wrapper::vloadq(in_ptr + x);
				137	auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
				138	// Store results
				139	wrapper::vstore(out_ptr + x, res);
				140	}
				141	for(; x < window_end_x; ++x)
				142	{
				143	(out_ptr + x) = ((in_ptr + x) - mean) * stddev_inv;
				144	}
				145	},
				146	input, output);
				147	}
				148
				149	NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
				150	: _input(nullptr), _output(nullptr), _epsilon(1e-8f), _func(nullptr)
				151	{
				152	}
				153
				154	void NEMeanStdDevNormalizationKernel::configure(ITensor input, ITensor output, float epsilon)
				155	{
				156	ARM_COMPUTE_ERROR_ON_NULLPTR(input);
				157
				158	ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
				159
				160	_input = input;
				161	_output = (output == nullptr) ? input : output;
				162	_epsilon = epsilon;
				163
				164	// Configure kernel window
				165	auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
				166	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
				167	ICPPKernel::configure(win_config.second);
				168
				169	// Configure function to run based on different data types
				170	const DataType data_type = input->info()->data_type();
				171	switch(data_type)
				172	{
				173	case DataType::F32:
				174	_func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float, 4>;
				175	break;
				176	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				177	case DataType::F16:
				178	_func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float16_t, 8>;
				179	break;
				180	#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				181	default:
				182	ARM_COMPUTE_ERROR("Not Supported");
				183	break;
				184	}
				185	}
				186
				187	Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo input, const ITensorInfo output, float epsilon)
				188	{
				189	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
				190	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
				191	return Status{};
				192	}
				193
				194	void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo &info)
				195	{
				196	ARM_COMPUTE_UNUSED(info);
				197	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				198	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
				199	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				200
				201	(this->*_func)(window);
				202	}
				203	} // namespace arm_compute