Blame - src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp - ml/ComputeLibrary

blob: 2d7c29d9a0665017d04534ba7a152d413d9a26a6 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/IAccessWindow.h"
				29	#include "arm_compute/core/ITensor.h"
				30	#include "arm_compute/core/Validate.h"
				31
				32	#include <arm_neon.h>
				33	#include <cstdint>
				34
				35	using namespace arm_compute;
				36
				37	namespace arm_compute
				38	{
				39	class Coordinates;
				40	} // namespace arm_compute
				41
				42	namespace
				43	{
				44	// Defines for computing atan2
				45	constexpr float SCALE_FACTOR = 0.7111111111111111f;
				46	constexpr float PI = 3.141592653589793f;
				47	constexpr float SCALE_180 = 180.0f / PI;
				48	constexpr float SCALE_360 = SCALE_180 * SCALE_FACTOR;
				49	constexpr float PI_4 = 0.7853981633974483f;
				50	constexpr float COEFF1 = 0.0663f;
				51	constexpr float COEFF2 = 0.2447f;
				52	} // namespace
				53
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	54	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	55	namespace fp16
				56	{
				57	inline float16x8_t inv(float16x8_t x)
				58	{
				59	const float16x8_t estimate = vrecpeq_f16(x);
				60	return vmulq_f16(estimate, vrecpsq_f16(x, estimate));
				61	}
				62
				63	inline float16x8_t atan2_fast(float16x8_t gx, float16x8_t gy, float16x8_t scale)
				64	{
				65	static const float16x8_t one = vdupq_n_f16(1.0f);
				66	static const float16x8_t ninety = vdupq_n_f16(90.f * SCALE_FACTOR);
				67	static const float16x8_t epsilon = vdupq_n_f16(1e-9f);
				68	static const float16x8_t piover4 = vdupq_n_f16(PI_4);
				69	static const float16x8_t coeff1 = vdupq_n_f16(COEFF1);
				70	static const float16x8_t coeff2 = vdupq_n_f16(COEFF2);
				71
				72	const float16x8_t abs_gx = vabsq_f16(gx);
				73	const float16x8_t abs_gy = vabsq_f16(gy);
				74	const float16x8_t tmin = vminq_f16(abs_gx, abs_gy);
				75	const float16x8_t tmax = vmaxq_f16(abs_gx, abs_gy);
				76
				77	// z = min(x, y) / max(x, y)
				78	const float16x8_t z = vmulq_f16(tmin, inv(vaddq_f16(tmax, epsilon)));
				79	const float16x8_t absz = vabsq_f16(z);
				80
				81	// = x * [pi/4 + (1 - \|x\|) * (0.2447 + 0.0663 * \|x\|)]
				82	float16x8_t arctan = vmulq_f16(z, vfmaq_f16(piover4,
				83	vsubq_f16(one, absz),
				84	vfmaq_f16(coeff2, coeff1, absz)));
				85
				86	// Radians to degrees conversion with applied a scale factor in order to have the result [0, 255]
				87	arctan = vmulq_f16(arctan, scale);
				88
				89	/* If z > 1, result = 90 - result */
				90	return vbslq_f16(vcgeq_f16(abs_gx, abs_gy), arctan, vsubq_f16(ninety, arctan));
				91	}
				92
				93	inline float16x8_t atan2_0_360(float16x8_t gx, float16x8_t gy)
				94	{
				95	static const float16x8_t scale = vdupq_n_f16(SCALE_360);
				96	static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
				97	static const float16x8_t zero = vdupq_n_f16(0.0f);
				98	static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
				99
				100	float16x8_t arctan = atan2_fast(gx, gy, scale);
				101
				102	// Choose correct quadrant
				103	arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
				104	arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
				105
				106	return arctan;
				107	}
				108
				109	inline float16x8_t atan2_0_180(float16x8_t gx, float16x8_t gy)
				110	{
				111	static const float16x8_t scale = vdupq_n_f16(SCALE_180);
				112	static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
				113	static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
				114	static const float16x8_t zero = vdupq_n_f16(0.0f);
				115
				116	float16x8_t arctan = atan2_fast(gx, gy, scale);
				117
				118	// Choose correct quadrant
				119	arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
				120	arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
				121	arctan = vbslq_f16(vcgtq_f16(arctan, oneeighty), vsubq_f16(arctan, oneeighty), arctan);
				122
				123	return arctan;
				124	}
				125
				126	inline float32x4_t invsqrtv(float32x4_t x)
				127	{
				128	float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
				129
				130	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				131	sqrt_reciprocal);
				132	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				133	sqrt_reciprocal);
				134
				135	return sqrt_reciprocal;
				136	}
				137
				138	inline float32x4_t sqrtv(float32x4_t x)
				139	{
				140	float32x4_t res = vdupq_n_f32(0.5f);
				141	return vmlaq_f32(res, x, invsqrtv(x));
				142	}
				143
				144	inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
				145	{
John Richardson	3c5f949	2017-10-04 15:27:37 +0100	[diff] [blame]	146	return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	147	}
				148
				149	inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
				150	{
				151	const int32x4x2_t square_x =
				152	{
				153	vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
				154	vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
				155	};
				156
				157	const int32x4x2_t square_y =
				158	{
				159	vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
				160	vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
				161	};
				162
				163	const uint32x4x2_t sum =
				164	{
				165	vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]),
				166	vreinterpretq_u32_s32(square_y.val[0])),
				167	vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]),
				168	vreinterpretq_u32_s32(square_y.val[1]))
				169	};
				170
				171	const float32x4x2_t res =
				172	{
				173	sqrtv(vcvtq_f32_u32(sum.val[0])),
				174	sqrtv(vcvtq_f32_u32(sum.val[1]))
				175	};
				176
				177	return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
				178	vqmovn_s32(vcvtq_s32_f32(res.val[1])));
				179	}
				180
				181	inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
				182	{
				183	static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
				184
				185	const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
				186	const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
				187
				188	// Compute fast atan2
				189	const float16x8_t angle = atan2_0_360(inputx_f16, inputy_f16);
				190
				191	return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
				192	}
				193
				194	inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
				195	{
				196	static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
				197
				198	const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
				199	const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
				200
				201	// Compute fast atan2
				202	const float16x8_t angle = atan2_0_180(inputx_f16, inputy_f16);
				203
				204	return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
				205	}
				206
				207	template <MagnitudeType mag_type>
				208	inline int16x8x2_t compute_magnitude(const int16x8x2_t &in0, const int16x8x2_t &gx);
				209
				210	template <>
				211	inline int16x8x2_t compute_magnitude<MagnitudeType::L2NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				212	{
				213	const int16x8x2_t mag =
				214	{
				215	magnitude_l2(in0.val[0], gx.val[0]),
				216	magnitude_l2(in0.val[1], gx.val[1])
				217	};
				218
				219	return mag;
				220	}
				221
				222	template <>
				223	inline int16x8x2_t compute_magnitude<MagnitudeType::L1NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				224	{
				225	const int16x8x2_t mag =
				226	{
				227	magnitude_l1(in0.val[0], gx.val[0]),
				228	magnitude_l1(in0.val[1], gx.val[1])
				229	};
				230
				231	return mag;
				232	}
				233
				234	template <PhaseType phase_type>
				235	inline uint8x16_t compute_phase(const int16x8x2_t &in0, const int16x8x2_t &gx);
				236
				237	template <>
				238	inline uint8x16_t compute_phase<PhaseType::SIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				239	{
				240	return vcombine_u8(phase_signed(in0.val[0], gx.val[0]),
				241	phase_signed(in0.val[1], gx.val[1]));
				242	}
				243
				244	template <>
				245	inline uint8x16_t compute_phase<PhaseType::UNSIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				246	{
				247	return vcombine_u8(phase_unsigned(in0.val[0], gx.val[0]),
				248	phase_unsigned(in0.val[1], gx.val[1]));
				249	}
				250	} // namespace fp16
				251
				252	template <MagnitudeType mag_type, PhaseType phase_type>
				253	NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::NEMagnitudePhaseFP16Kernel()
				254	: _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
				255	{
				256	}
				257
				258	template <MagnitudeType mag_type, PhaseType phase_type>
				259	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::configure(const ITensor gx, const ITensor gy, ITensor magnitude, ITensor phase)
				260	{
				261	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gx, Format::S16);
				262	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gy, Format::S16);
				263	ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
				264
				265	const bool run_mag = magnitude != nullptr;
				266	const bool run_phase = phase != nullptr;
				267
				268	if(run_mag)
				269	{
				270	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(magnitude, Format::S16);
				271	}
				272
				273	if(run_phase)
				274	{
				275	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(phase, Format::U8);
				276	}
				277
				278	_gx = gx;
				279	_gy = gy;
				280	_magnitude = magnitude;
				281	_phase = phase;
				282
				283	if(run_mag && run_phase)
				284	{
				285	/* Run magnitude and phase */
				286	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase;
				287	}
				288	else if(run_mag)
				289	{
				290	/* Run magnitude */
				291	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude;
				292	}
				293	else if(run_phase)
				294	{
				295	/* Run phase */
				296	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase;
				297	}
				298	else
				299	{
				300	ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
				301	}
				302
				303	const unsigned int num_elems_processed_per_iteration = 16;
				304
				305	// Configure kernel window
				306	Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
				307	AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
				308	AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
				309
				310	update_window_and_padding(win,
				311	AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
				312	AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
				313	magnitude_access,
				314	phase_access);
				315
				316	ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
				317	gy->info()->valid_region());
				318
				319	magnitude_access.set_valid_region(win, valid_region);
				320	phase_access.set_valid_region(win, valid_region);
				321
				322	INEKernel::configure(win);
				323	}
				324
				325	template <MagnitudeType mag_type, PhaseType phase_type>
				326	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude(const Window &window)
				327	{
				328	Iterator gx(_gx, window);
				329	Iterator gy(_gy, window);
				330	Iterator magnitude(_magnitude, window);
				331
				332	execute_window_loop(window, [&](const Coordinates & id)
				333	{
				334	const int16x8x2_t input1 =
				335	{
				336	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				337	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				338	};
				339
				340	const int16x8x2_t input2 =
				341	{
				342	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				343	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				344	};
				345
				346	// Compute and store magnitude
				347	const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
				348
				349	/* Store magnitude */
				350	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				351	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				352	},
				353	gx, gy, magnitude);
				354	}
				355
				356	template <MagnitudeType mag_type, PhaseType phase_type>
				357	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase(const Window &window)
				358	{
				359	Iterator gx(_gx, window);
				360	Iterator gy(_gy, window);
				361	Iterator phase(_phase, window);
				362
				363	execute_window_loop(window, [&](const Coordinates & id)
				364	{
				365	const int16x8x2_t input1 =
				366	{
				367	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				368	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				369	};
				370
				371	const int16x8x2_t input2 =
				372	{
				373	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				374	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				375	};
				376
				377	// Compute and store phase
				378	vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
				379	},
				380	gx, gy, phase);
				381	}
				382
				383	template <MagnitudeType mag_type, PhaseType phase_type>
				384	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase(const Window &window)
				385	{
				386	Iterator gx(_gx, window);
				387	Iterator gy(_gy, window);
				388	Iterator magnitude(_magnitude, window);
				389	Iterator phase(_phase, window);
				390
				391	execute_window_loop(window, [&](const Coordinates & id)
				392	{
				393	const int16x8x2_t input1 =
				394	{
				395	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				396	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				397	};
				398
				399	const int16x8x2_t input2 =
				400	{
				401	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				402	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				403	};
				404
				405	// Compute and store magnitude
				406	const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
				407
				408	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				409	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				410
				411	// Compute and store phase
				412	vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
				413	},
				414	gx, gy, magnitude, phase);
				415	}
				416
				417	template <MagnitudeType mag_type, PhaseType phase_type>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	418	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	419	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	420	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	421	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				422	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				423	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				424
				425	(this->*_func)(window);
				426	}
				427
				428	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
				429	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
				430	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
				431	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	432	#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	433
				434	namespace
				435	{
				436	inline float32x4_t inv(float32x4_t x)
				437	{
				438	float32x4_t result = vrecpeq_f32(x);
				439	result = vmulq_f32(vrecpsq_f32(x, result), result);
				440	return result;
				441	}
				442
				443	inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
				444	{
				445	const float32x4_t zero = vdupq_n_f32(0.0f);
				446	const float32x4_t epsilon = vdupq_n_f32(1e-9f);
				447	const float32x4_t piover4 = vdupq_n_f32(PI_4);
				448	const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
				449	const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
				450	const float32x4_t ninety = vdupq_n_f32(90.0f * SCALE_FACTOR);
				451	const float32x4_t oneeighty = vdupq_n_f32(180.0f * SCALE_FACTOR);
				452	const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
				453	const float32x4_t scale = vdupq_n_f32(SCALE_360);
				454
				455	float32x4_t abs_gx = vabsq_f32(gx);
				456	float32x4_t abs_gy = vabsq_f32(gy);
				457	float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
				458	float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
				459	float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
				460	float32x4_t absz = vabsq_f32(z);
				461	float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
				462
				463	/* Compute y = pi/4 * x - x(abs(x)-1)(0.2447+0.0663 * abs(x) */
				464	float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
				465	result = vmulq_f32(result, term);
				466	result = vmlaq_f32(result, piover4, z);
				467
				468	/* Radians to degrees conversion with applied a scale factor in order to have the result [0, 255] */
				469	result = vmulq_f32(result, scale);
				470
				471	/* If z > 1, result = 90 - result */
				472	result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
				473
				474	/* Choose correct quadrant */
				475	result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
				476	result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
				477
				478	return result;
				479	}
				480
				481	inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
				482	{
				483	const float32x4_t zero = vdupq_n_f32(0.0f);
				484	const float32x4_t epsilon = vdupq_n_f32(1e-9f); // epsilon used to avoiding division by 0
				485	const float32x4_t piover4 = vdupq_n_f32(PI_4);
				486	const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
				487	const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
				488	const float32x4_t ninety = vdupq_n_f32(90.0f);
				489	const float32x4_t oneeighty = vdupq_n_f32(180.0f);
				490	const float32x4_t threesixty = vdupq_n_f32(360.0f);
				491	const float32x4_t scale = vdupq_n_f32(SCALE_180);
				492
				493	float32x4_t abs_gx = vabsq_f32(gx);
				494	float32x4_t abs_gy = vabsq_f32(gy);
				495	float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
				496	float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
				497	float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
				498	float32x4_t absz = vabsq_f32(z);
				499
				500	/* Compute y = pi/4 * z - z(abs(z)-1)(0.2447+0.0663 * abs(z) */
				501	float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
				502	float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
				503	result = vmulq_f32(result, term);
				504	result = vmlaq_f32(result, piover4, z);
				505
				506	/* Radians to degrees conversion */
				507	result = vmulq_f32(result, scale);
				508
				509	/* If z > 1, result = 90 - result */
				510	result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
				511
				512	/* Choose correct quadrant */
				513	result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
				514	result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
				515	result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
				516
				517	return result;
				518	}
				519
				520	inline float32x4_t invsqrtv(float32x4_t x)
				521	{
				522	float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
				523
				524	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				525	sqrt_reciprocal);
				526	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				527	sqrt_reciprocal);
				528
				529	return sqrt_reciprocal;
				530	}
				531
				532	inline float32x4_t sqrtv(float32x4_t x)
				533	{
				534	float32x4_t res = vdupq_n_f32(0.5f);
				535	return vmlaq_f32(res, x, invsqrtv(x));
				536	}
				537
				538	inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
				539	{
				540	const int32x4x2_t square_x =
				541	{
				542	{
				543	vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
				544	vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
				545	}
				546	};
				547
				548	const int32x4x2_t square_y =
				549	{
				550	{
				551	vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
				552	vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
				553	}
				554	};
				555
				556	const uint32x4x2_t sum =
				557	{
				558	{
				559	vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
				560	vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
				561	}
				562	};
				563
				564	const float32x4x2_t res =
				565	{
				566	{
				567	sqrtv(vcvtq_f32_u32(sum.val[0])),
				568	sqrtv(vcvtq_f32_u32(sum.val[1]))
				569	}
				570	};
				571
				572	return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
				573	vqmovn_s32(vcvtq_s32_f32(res.val[1])));
				574	}
				575
				576	inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
				577	{
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	578	/* Saturating add */
John Richardson	3c5f949	2017-10-04 15:27:37 +0100	[diff] [blame]	579	return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	580	}
				581
				582	inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
				583	{
				584	const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
				585
				586	float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
				587	float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
				588	float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
				589	float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
				590
				591	/* Compute fast atan2 */
				592	float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
				593	float32x4_t angle_low = atan2_0_360(inputx_f32_low, inputy_f32_low);
				594
				595	angle_high = vaddq_f32(angle_high, zeropointfive);
				596	angle_low = vaddq_f32(angle_low, zeropointfive);
				597
				598	return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
				599	vqmovun_s32(vcvtq_s32_f32(angle_high))));
				600	}
				601
				602	inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
				603	{
				604	const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
				605
				606	float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
				607	float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
				608	float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
				609	float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
				610
				611	/* Compute fast atan2 */
				612	float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
				613	float32x4_t angle_low = atan2_0_180(inputx_f32_low, inputy_f32_low);
				614
				615	angle_high = vaddq_f32(angle_high, zeropointfive);
				616	angle_low = vaddq_f32(angle_low, zeropointfive);
				617
				618	return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
				619	vqmovun_s32(vcvtq_s32_f32(angle_high))));
				620	}
				621	} // namespace
				622
				623	template <MagnitudeType mag_type, PhaseType phase_type>
				624	NEMagnitudePhaseKernel<mag_type, phase_type>::NEMagnitudePhaseKernel()
				625	: _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
				626	{
				627	}
				628
				629	template <MagnitudeType mag_type, PhaseType phase_type>
				630	void NEMagnitudePhaseKernel<mag_type, phase_type>::configure(const ITensor gx, const ITensor gy, ITensor magnitude, ITensor phase)
				631	{
				632	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16);
				633	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16);
				634	ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
				635
				636	const bool run_mag = magnitude != nullptr;
				637	const bool run_phase = phase != nullptr;
				638
				639	if(run_mag)
				640	{
				641	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16);
				642	}
				643
				644	if(run_phase)
				645	{
				646	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
				647	}
				648
				649	_gx = gx;
				650	_gy = gy;
				651	_magnitude = magnitude;
				652	_phase = phase;
				653
				654	if(run_mag && run_phase)
				655	{
				656	/* Run magnitude and phase */
				657	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase;
				658	}
				659	else
				660	{
				661	if(run_mag)
				662	{
				663	/* Run magnitude */
				664	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude;
				665	}
				666	else if(run_phase)
				667	{
				668	/* Run phase */
				669	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::phase;
				670	}
				671	else
				672	{
				673	ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
				674	}
				675	}
				676
				677	constexpr unsigned int num_elems_processed_per_iteration = 16;
				678
				679	// Configure kernel window
				680	Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
				681	AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
				682	AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
				683
				684	update_window_and_padding(win,
				685	AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
				686	AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
				687	magnitude_access,
				688	phase_access);
				689
				690	ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
				691	gy->info()->valid_region());
				692
				693	magnitude_access.set_valid_region(win, valid_region);
				694	phase_access.set_valid_region(win, valid_region);
				695
				696	INEKernel::configure(win);
				697	}
				698
				699	template <MagnitudeType mag_type, PhaseType phase_type>
				700	void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude(const Window &window)
				701	{
				702	Iterator gx(_gx, window);
				703	Iterator gy(_gy, window);
				704	Iterator magnitude(_magnitude, window);
				705
				706	execute_window_loop(window, [&](const Coordinates & id)
				707	{
				708	const int16x8x2_t input1 =
				709	{
				710	{
				711	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				712	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				713	}
				714	};
				715
				716	const int16x8x2_t input2 =
				717	{
				718	{
				719	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				720	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				721	}
				722	};
				723
				724	/* Compute magnitude */
				725	int16x8x2_t mag{ {} };
				726
				727	if(MagnitudeType::L2NORM == mag_type)
				728	{
				729	mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
				730	mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
				731	}
				732	else
				733	{
				734	mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
				735	mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
				736	}
				737
				738	/* Store magnitude */
				739	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				740	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				741	},
				742	gx, gy, magnitude);
				743	}
				744
				745	template <MagnitudeType mag_type, PhaseType phase_type>
				746	void NEMagnitudePhaseKernel<mag_type, phase_type>::phase(const Window &window)
				747	{
				748	Iterator gx(_gx, window);
				749	Iterator gy(_gy, window);
				750	Iterator phase(_phase, window);
				751
				752	execute_window_loop(window, [&](const Coordinates & id)
				753	{
				754	const int16x8x2_t input1 =
				755	{
				756	{
				757	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				758	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				759	}
				760	};
				761
				762	const int16x8x2_t input2 =
				763	{
				764	{
				765	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				766	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				767	}
				768	};
				769
				770	/* Compute phase */
				771	uint8x8x2_t vphase{ {} };
				772
				773	if(PhaseType::SIGNED == phase_type)
				774	{
				775	vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
				776	vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
				777	}
				778	else
				779	{
				780	vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
				781	vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
				782	}
				783
				784	/* Store phase */
				785	vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
				786	},
				787	gx, gy, phase);
				788	}
				789
				790	template <MagnitudeType mag_type, PhaseType phase_type>
				791	void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase(const Window &window)
				792	{
				793	Iterator gx(_gx, window);
				794	Iterator gy(_gy, window);
				795	Iterator magnitude(_magnitude, window);
				796	Iterator phase(_phase, window);
				797
				798	execute_window_loop(window, [&](const Coordinates & id)
				799	{
				800	const int16x8x2_t input1 =
				801	{
				802	{
				803	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				804	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				805	}
				806	};
				807
				808	const int16x8x2_t input2 =
				809	{
				810	{
				811	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				812	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				813	}
				814	};
				815
				816	/* Compute magnitude */
				817	int16x8x2_t mag{ {} };
				818
				819	if(MagnitudeType::L2NORM == mag_type)
				820	{
				821	mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
				822	mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
				823	}
				824	else
				825	{
				826	mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
				827	mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
				828	}
				829
				830	/* Store magnitude */
				831	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				832	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				833
				834	/* Compute phase */
				835	uint8x8x2_t vphase{ {} };
				836
				837	if(PhaseType::SIGNED == phase_type)
				838	{
				839	vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
				840	vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
				841	}
				842	else
				843	{
				844	vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
				845	vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
				846	}
				847
				848	/* Store phase */
				849	vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
				850	},
				851	gx, gy, magnitude, phase);
				852	}
				853
				854	template <MagnitudeType mag_type, PhaseType phase_type>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	855	void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	856	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	857	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	858	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				859	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				860	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				861
				862	(this->*_func)(window);
				863	}
				864
				865	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
				866	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
				867	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
				868	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;