Blame - src/core/NEON/kernels/NEMeanStdDevKernel.cpp - ml/ComputeLibrary

blob: 7895b009d6918664edef0a59c2dd422a96571cf9 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/IAccessWindow.h"
				29	#include "arm_compute/core/TensorInfo.h"
				30	#include "arm_compute/core/Types.h"
				31	#include "arm_compute/core/Validate.h"
				32
				33	#include <arm_neon.h>
				34	#include <cmath>
				35	#include <tuple>
				36	#include <utility>
				37
				38	using namespace arm_compute;
				39
				40	namespace arm_compute
				41	{
				42	class Coordinates;
				43	} // namespace arm_compute
				44
				45	namespace
				46	{
				47	template <bool calc_sum_squared>
				48	std::pair<uint64x1_t, uint64x1_t> accumulate(const Window &window, Iterator &iterator)
				49	{
				50	uint64x1_t sum = vdup_n_u64(0);
				51	uint64x1_t sum_squared = vdup_n_u64(0);
				52
				53	// Calculate sum
				54	execute_window_loop(window, [&](const Coordinates & id)
				55	{
				56	const uint8x16_t in_data = vld1q_u8(iterator.ptr());
				57
				58	// Sum of the low and high elements of data
				59	const uint16x8_t tmp0 = vaddl_u8(vget_low_u8(in_data), vget_high_u8(in_data));
				60	const uint32x4_t tmp1 = vaddl_u16(vget_low_u16(tmp0), vget_high_u16(tmp0));
				61	const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
				62
				63	// Update sum
				64	sum = vpadal_u32(sum, tmp2);
				65
				66	if(calc_sum_squared)
				67	{
				68	const uint16x8_t square_data_low = vmull_u8(vget_low_u8(in_data), vget_low_u8(in_data));
				69	const uint16x8_t square_data_high = vmull_u8(vget_high_u8(in_data), vget_high_u8(in_data));
				70
				71	// Sum of the low and high elements of data
				72	const uint32x4_t tmp0_low = vaddl_u16(vget_low_u16(square_data_low), vget_high_u16(square_data_low));
				73	const uint32x4_t tmp0_high = vaddl_u16(vget_low_u16(square_data_high), vget_high_u16(square_data_high));
				74	const uint32x4_t tmp1 = vaddq_u32(tmp0_low, tmp0_high);
				75	const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
				76
				77	// Update sum
				78	sum_squared = vpadal_u32(sum_squared, tmp2);
				79	}
				80	},
				81	iterator);
				82
				83	return std::make_pair(sum, sum_squared);
				84	}
				85	} // namespace
				86
				87	NEMeanStdDevKernel::NEMeanStdDevKernel()
Giorgio Arena	a261181	2017-07-21 10:08:48 +0100	[diff] [blame]	88	: _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx(), _border_size(0)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	89	{
				90	}
				91
Giorgio Arena	a261181	2017-07-21 10:08:48 +0100	[diff] [blame]	92	BorderSize NEMeanStdDevKernel::border_size() const
				93	{
				94	return _border_size;
				95	}
				96
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	97	void NEMeanStdDevKernel::configure(const IImage input, float mean, uint64_t global_sum, float stddev, uint64_t *global_sum_squared)
				98	{
				99	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
				100	ARM_COMPUTE_ERROR_ON(nullptr == mean);
				101	ARM_COMPUTE_ERROR_ON(nullptr == global_sum);
				102	ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
				103	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				104
				105	_input = input;
				106	_mean = mean;
				107	_stddev = stddev;
				108	_global_sum = global_sum;
				109	_global_sum_squared = global_sum_squared;
				110
				111	constexpr unsigned int num_elems_processed_per_iteration = 16;
				112
Giorgio Arena	fc2817d	2017-06-27 17:26:37 +0100	[diff] [blame]	113	_border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration) - input->info()->dimension(0));
Giorgio Arena	a261181	2017-07-21 10:08:48 +0100	[diff] [blame]	114
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	115	// Configure kernel window
				116	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
				117
				118	update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
				119
				120	INEKernel::configure(win);
				121	}
				122
/** Accumulate the sums for @p window and refresh the mean (and optionally the standard deviation).
 *
 * The partial sums for this window are computed without holding the lock, then merged
 * into the shared totals under _mtx. The outputs are rewritten from the running totals
 * on every call.
 * NOTE(review): nothing here resets *_global_sum / *_global_sum_squared, so the caller
 * presumably zeroes them before dispatching the kernel - confirm.
 *
 * @param[in] window Region of the configured window to process.
 * @param[in] info   Thread information (unused).
 */
void NEMeanStdDevKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    Iterator input(_input, window);

    uint64x1_t local_sum         = vdup_n_u64(0);
    uint64x1_t local_sum_squared = vdup_n_u64(0);

    // Only pay for the sum of squares when a standard deviation was requested
    if(_stddev != nullptr)
    {
        std::tie(local_sum, local_sum_squared) = accumulate<true>(window, input);
    }
    else
    {
        std::tie(local_sum, local_sum_squared) = accumulate<false>(window, input);
    }

    // Normalise by the size of the whole image, not just this window
    const float num_pixels = static_cast<float>(_input->info()->dimension(0) * _input->info()->dimension(1));

    // Merge sum and calculate mean and stddev; the lock is released when it goes out of scope
    std::unique_lock<arm_compute::Mutex> lock(_mtx);

    *_global_sum += vget_lane_u64(local_sum, 0);

    const float mean = *_global_sum / num_pixels;
    *_mean           = mean;

    if(_stddev != nullptr)
    {
        const uint64_t tmp_sum_squared = vget_lane_u64(local_sum_squared, 0);
        *_global_sum_squared += tmp_sum_squared;
        // Var(X) = E[X^2] - E[X]^2; write through the output pointer
        *_stddev = std::sqrt((*_global_sum_squared / num_pixels) - (mean * mean));
    }
}