/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include <arm_neon.h>

namespace arm_compute
{
namespace
{
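// Per-plane instance normalization for NCHW tensors: for each (channel, batch)
// plane the mean and variance are computed over the H x W elements and the
// output is produced as out = (in - mean) * gamma / sqrt(var + epsilon) + beta.
// Both the reduction and the normalization are vectorized along the X (width)
// dimension, with scalar tail loops handling the left-over elements.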
template <typename T>
void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
{
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    // Clear X/Y dimensions on execution window as we handle the planes manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    win.set(Window::DimY, Window::Dimension(0, 1, 1));

    constexpr int window_step_x = 16 / sizeof(T);
    const unsigned int elements_plane = input->info()->dimension(0) * input->info()->dimension(1);

    Iterator input_it(input, win);
    execute_window_loop(win, [&](const Coordinates & id)
    {
        Window win_plane = window;
        win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
        win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
        win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));

        Iterator input_plane_it(input, win_plane);
        Iterator output_plane_it(output, win_plane);

        auto sum_h_w = static_cast<T>(0.f);
        auto sum_squares_h_w = static_cast<T>(0.f);

        execute_window_loop(win_plane, [&](const Coordinates &)
        {
            const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());

            auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
            auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});

            // Compute S elements per iteration
            int x = window.x().start();
            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
            {
                auto vec_input_val = wrapper::vloadq(input_ptr + x);
                vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val);
                vec_sum_squares_h_w = wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
            }

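            // Horizontal reduction: fold the high and low halves of the 128-bit
            // accumulators together, then pairwise-add until lane 0 holds the
            // sum of all lanes, and accumulate it into the scalar running sums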
            auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
            auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
            for(int i = 0; i < window_step_x / 4; ++i)
            {
                vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
                vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
            }
            sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
            sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);

            // Compute left-over elements
            for(; x < window.x().end(); ++x)
            {
                const auto value = *(input_ptr + x);
                sum_h_w += value;
                sum_squares_h_w += value * value;
            }
        },
        input_plane_it, output_plane_it);

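        // Plane statistics: mean = sum / N and var = E[x^2] - (E[x])^2 over the H x W elements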
        const auto mean_h_w = sum_h_w / elements_plane;
        const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;

        const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
        const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
        const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
        const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});

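        // Second pass over the plane: normalize as (in - mean_h_w) * multip_h_w + beta,
        // where multip_h_w = gamma / sqrt(var + epsilon) has been folded into a single scale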
        execute_window_loop(win_plane, [&](const Coordinates &)
        {
            auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
            auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());

            // Compute S elements per iteration
            int x = window.x().start();
            auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
            {
                vec_val = wrapper::vloadq(input_ptr + x);
                vec_val = wrapper::vadd(wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
                wrapper::vstore(output_ptr + x, vec_val);
            }

            // Compute left-over elements
            for(; x < window.x().end(); ++x)
            {
                *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta;
            }
        },
        input_plane_it, output_plane_it);
    },
    input_it);
}

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
{
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_UNUSED(gamma);
    ARM_COMPUTE_UNUSED(beta);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different from 0");

    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly");

    if(output != nullptr && output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
    }

    return Status{};
}

std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
    // We handle the planes manually
    Window win = calculate_max_window(*input, Steps(1));

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());

    // NEInstanceNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
    Coordinates coord;
    coord.set_num_dimensions(output->num_dimensions());
    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
    return std::make_tuple(Status{}, win);
}
} // namespace

NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel()
    : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
{
}

void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input);

    _input = input;
    _output = output == nullptr ? input : output;
    _gamma = gamma;
    _beta = beta;
    _epsilon = epsilon;

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), gamma, beta, epsilon));

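    // Select the data-type-specific implementation; the F16 path is only
    // available when the target supports FP16 vector arithmetic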
    if(_input->info()->data_type() == DataType::F32)
    {
        _func = &instance_normalization_nchw<float>;
    }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    else if(_input->info()->data_type() == DataType::F16)
    {
        _func = &instance_normalization_nchw<float16_t>;
    }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    else
    {
        ARM_COMPUTE_ERROR("Unsupported data type");
    }

    // Configure kernel window
    auto win_config = validate_and_configure_window(_input->info(), _output->info());
    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));

    INEKernel::configure(std::get<1>(win_config));
}

Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
    return Status{};
}

void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
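    // Dispatch to the data-type-specific function selected in configure()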
    (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
}
} // namespace arm_compute