Blame - src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp - ml/ComputeLibrary

blob: 599dad6c70501328149ad5e6d856322fec71d71c [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/IAccessWindow.h"
				29	#include "arm_compute/core/ITensor.h"
				30	#include "arm_compute/core/Validate.h"
				31
				32	#include <arm_neon.h>
				33	#include <cstdint>
				34
				35	using namespace arm_compute;
				36
				37	namespace arm_compute
				38	{
				39	class Coordinates;
				40	} // namespace arm_compute
				41
				42	namespace
				43	{
				44	// Defines for computing atan2
				45	constexpr float SCALE_FACTOR = 0.7111111111111111f;
				46	constexpr float PI = 3.141592653589793f;
				47	constexpr float SCALE_180 = 180.0f / PI;
				48	constexpr float SCALE_360 = SCALE_180 * SCALE_FACTOR;
				49	constexpr float PI_4 = 0.7853981633974483f;
				50	constexpr float COEFF1 = 0.0663f;
				51	constexpr float COEFF2 = 0.2447f;
				52	} // namespace
				53
				54	#ifdef ARM_COMPUTE_ENABLE_FP16
				55	namespace fp16
				56	{
				57	inline float16x8_t inv(float16x8_t x)
				58	{
				59	const float16x8_t estimate = vrecpeq_f16(x);
				60	return vmulq_f16(estimate, vrecpsq_f16(x, estimate));
				61	}
				62
				63	inline float16x8_t atan2_fast(float16x8_t gx, float16x8_t gy, float16x8_t scale)
				64	{
				65	static const float16x8_t one = vdupq_n_f16(1.0f);
				66	static const float16x8_t ninety = vdupq_n_f16(90.f * SCALE_FACTOR);
				67	static const float16x8_t epsilon = vdupq_n_f16(1e-9f);
				68	static const float16x8_t piover4 = vdupq_n_f16(PI_4);
				69	static const float16x8_t coeff1 = vdupq_n_f16(COEFF1);
				70	static const float16x8_t coeff2 = vdupq_n_f16(COEFF2);
				71
				72	const float16x8_t abs_gx = vabsq_f16(gx);
				73	const float16x8_t abs_gy = vabsq_f16(gy);
				74	const float16x8_t tmin = vminq_f16(abs_gx, abs_gy);
				75	const float16x8_t tmax = vmaxq_f16(abs_gx, abs_gy);
				76
				77	// z = min(x, y) / max(x, y)
				78	const float16x8_t z = vmulq_f16(tmin, inv(vaddq_f16(tmax, epsilon)));
				79	const float16x8_t absz = vabsq_f16(z);
				80
				81	// = x * [pi/4 + (1 - \|x\|) * (0.2447 + 0.0663 * \|x\|)]
				82	float16x8_t arctan = vmulq_f16(z, vfmaq_f16(piover4,
				83	vsubq_f16(one, absz),
				84	vfmaq_f16(coeff2, coeff1, absz)));
				85
				86	// Radians to degrees conversion with applied a scale factor in order to have the result [0, 255]
				87	arctan = vmulq_f16(arctan, scale);
				88
				89	/* If z > 1, result = 90 - result */
				90	return vbslq_f16(vcgeq_f16(abs_gx, abs_gy), arctan, vsubq_f16(ninety, arctan));
				91	}
				92
				93	inline float16x8_t atan2_0_360(float16x8_t gx, float16x8_t gy)
				94	{
				95	static const float16x8_t scale = vdupq_n_f16(SCALE_360);
				96	static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
				97	static const float16x8_t zero = vdupq_n_f16(0.0f);
				98	static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
				99
				100	float16x8_t arctan = atan2_fast(gx, gy, scale);
				101
				102	// Choose correct quadrant
				103	arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
				104	arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
				105
				106	return arctan;
				107	}
				108
				109	inline float16x8_t atan2_0_180(float16x8_t gx, float16x8_t gy)
				110	{
				111	static const float16x8_t scale = vdupq_n_f16(SCALE_180);
				112	static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
				113	static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
				114	static const float16x8_t zero = vdupq_n_f16(0.0f);
				115
				116	float16x8_t arctan = atan2_fast(gx, gy, scale);
				117
				118	// Choose correct quadrant
				119	arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
				120	arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
				121	arctan = vbslq_f16(vcgtq_f16(arctan, oneeighty), vsubq_f16(arctan, oneeighty), arctan);
				122
				123	return arctan;
				124	}
				125
				126	inline float32x4_t invsqrtv(float32x4_t x)
				127	{
				128	float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
				129
				130	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				131	sqrt_reciprocal);
				132	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				133	sqrt_reciprocal);
				134
				135	return sqrt_reciprocal;
				136	}
				137
				138	inline float32x4_t sqrtv(float32x4_t x)
				139	{
				140	float32x4_t res = vdupq_n_f32(0.5f);
				141	return vmlaq_f32(res, x, invsqrtv(x));
				142	}
				143
				144	inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
				145	{
				146	return vqaddq_s16(vabsq_s16(input1), vabsq_s16(input2));
				147	}
				148
				149	inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
				150	{
				151	const int32x4x2_t square_x =
				152	{
				153	vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
				154	vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
				155	};
				156
				157	const int32x4x2_t square_y =
				158	{
				159	vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
				160	vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
				161	};
				162
				163	const uint32x4x2_t sum =
				164	{
				165	vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]),
				166	vreinterpretq_u32_s32(square_y.val[0])),
				167	vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]),
				168	vreinterpretq_u32_s32(square_y.val[1]))
				169	};
				170
				171	const float32x4x2_t res =
				172	{
				173	sqrtv(vcvtq_f32_u32(sum.val[0])),
				174	sqrtv(vcvtq_f32_u32(sum.val[1]))
				175	};
				176
				177	return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
				178	vqmovn_s32(vcvtq_s32_f32(res.val[1])));
				179	}
				180
				181	inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
				182	{
				183	static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
				184
				185	const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
				186	const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
				187
				188	// Compute fast atan2
				189	const float16x8_t angle = atan2_0_360(inputx_f16, inputy_f16);
				190
				191	return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
				192	}
				193
				194	inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
				195	{
				196	static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
				197
				198	const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
				199	const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
				200
				201	// Compute fast atan2
				202	const float16x8_t angle = atan2_0_180(inputx_f16, inputy_f16);
				203
				204	return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
				205	}
				206
				207	template <MagnitudeType mag_type>
				208	inline int16x8x2_t compute_magnitude(const int16x8x2_t &in0, const int16x8x2_t &gx);
				209
				210	template <>
				211	inline int16x8x2_t compute_magnitude<MagnitudeType::L2NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				212	{
				213	const int16x8x2_t mag =
				214	{
				215	magnitude_l2(in0.val[0], gx.val[0]),
				216	magnitude_l2(in0.val[1], gx.val[1])
				217	};
				218
				219	return mag;
				220	}
				221
				222	template <>
				223	inline int16x8x2_t compute_magnitude<MagnitudeType::L1NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				224	{
				225	const int16x8x2_t mag =
				226	{
				227	magnitude_l1(in0.val[0], gx.val[0]),
				228	magnitude_l1(in0.val[1], gx.val[1])
				229	};
				230
				231	return mag;
				232	}
				233
				234	template <PhaseType phase_type>
				235	inline uint8x16_t compute_phase(const int16x8x2_t &in0, const int16x8x2_t &gx);
				236
				237	template <>
				238	inline uint8x16_t compute_phase<PhaseType::SIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				239	{
				240	return vcombine_u8(phase_signed(in0.val[0], gx.val[0]),
				241	phase_signed(in0.val[1], gx.val[1]));
				242	}
				243
				244	template <>
				245	inline uint8x16_t compute_phase<PhaseType::UNSIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				246	{
				247	return vcombine_u8(phase_unsigned(in0.val[0], gx.val[0]),
				248	phase_unsigned(in0.val[1], gx.val[1]));
				249	}
				250	} // namespace fp16
				251
				252	template <MagnitudeType mag_type, PhaseType phase_type>
				253	NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::NEMagnitudePhaseFP16Kernel()
				254	: _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
				255	{
				256	}
				257
				258	template <MagnitudeType mag_type, PhaseType phase_type>
				259	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::configure(const ITensor gx, const ITensor gy, ITensor magnitude, ITensor phase)
				260	{
				261	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gx, Format::S16);
				262	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gy, Format::S16);
				263	ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
				264
				265	const bool run_mag = magnitude != nullptr;
				266	const bool run_phase = phase != nullptr;
				267
				268	if(run_mag)
				269	{
				270	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(magnitude, Format::S16);
				271	}
				272
				273	if(run_phase)
				274	{
				275	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(phase, Format::U8);
				276	}
				277
				278	_gx = gx;
				279	_gy = gy;
				280	_magnitude = magnitude;
				281	_phase = phase;
				282
				283	if(run_mag && run_phase)
				284	{
				285	/* Run magnitude and phase */
				286	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase;
				287	}
				288	else if(run_mag)
				289	{
				290	/* Run magnitude */
				291	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude;
				292	}
				293	else if(run_phase)
				294	{
				295	/* Run phase */
				296	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase;
				297	}
				298	else
				299	{
				300	ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
				301	}
				302
				303	const unsigned int num_elems_processed_per_iteration = 16;
				304
				305	// Configure kernel window
				306	Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
				307	AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
				308	AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
				309
				310	update_window_and_padding(win,
				311	AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
				312	AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
				313	magnitude_access,
				314	phase_access);
				315
				316	ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
				317	gy->info()->valid_region());
				318
				319	magnitude_access.set_valid_region(win, valid_region);
				320	phase_access.set_valid_region(win, valid_region);
				321
				322	INEKernel::configure(win);
				323	}
				324
				325	template <MagnitudeType mag_type, PhaseType phase_type>
				326	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude(const Window &window)
				327	{
				328	Iterator gx(_gx, window);
				329	Iterator gy(_gy, window);
				330	Iterator magnitude(_magnitude, window);
				331
				332	execute_window_loop(window, [&](const Coordinates & id)
				333	{
				334	const int16x8x2_t input1 =
				335	{
				336	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				337	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				338	};
				339
				340	const int16x8x2_t input2 =
				341	{
				342	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				343	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				344	};
				345
				346	// Compute and store magnitude
				347	const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
				348
				349	/* Store magnitude */
				350	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				351	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				352	},
				353	gx, gy, magnitude);
				354	}
				355
				356	template <MagnitudeType mag_type, PhaseType phase_type>
				357	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase(const Window &window)
				358	{
				359	Iterator gx(_gx, window);
				360	Iterator gy(_gy, window);
				361	Iterator phase(_phase, window);
				362
				363	execute_window_loop(window, [&](const Coordinates & id)
				364	{
				365	const int16x8x2_t input1 =
				366	{
				367	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				368	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				369	};
				370
				371	const int16x8x2_t input2 =
				372	{
				373	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				374	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				375	};
				376
				377	// Compute and store phase
				378	vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
				379	},
				380	gx, gy, phase);
				381	}
				382
				383	template <MagnitudeType mag_type, PhaseType phase_type>
				384	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase(const Window &window)
				385	{
				386	Iterator gx(_gx, window);
				387	Iterator gy(_gy, window);
				388	Iterator magnitude(_magnitude, window);
				389	Iterator phase(_phase, window);
				390
				391	execute_window_loop(window, [&](const Coordinates & id)
				392	{
				393	const int16x8x2_t input1 =
				394	{
				395	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				396	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				397	};
				398
				399	const int16x8x2_t input2 =
				400	{
				401	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				402	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				403	};
				404
				405	// Compute and store magnitude
				406	const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
				407
				408	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				409	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				410
				411	// Compute and store phase
				412	vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
				413	},
				414	gx, gy, magnitude, phase);
				415	}
				416
				417	template <MagnitudeType mag_type, PhaseType phase_type>
				418	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window)
				419	{
				420	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				421	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				422	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				423
				424	(this->*_func)(window);
				425	}
				426
				427	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
				428	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
				429	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
				430	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
Anthony Barbier	ac69aa1	2017-07-03 17:39:37 +0100	[diff] [blame]	431	#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	432
				433	namespace
				434	{
				435	inline float32x4_t inv(float32x4_t x)
				436	{
				437	float32x4_t result = vrecpeq_f32(x);
				438	result = vmulq_f32(vrecpsq_f32(x, result), result);
				439	return result;
				440	}
				441
				442	inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
				443	{
				444	const float32x4_t zero = vdupq_n_f32(0.0f);
				445	const float32x4_t epsilon = vdupq_n_f32(1e-9f);
				446	const float32x4_t piover4 = vdupq_n_f32(PI_4);
				447	const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
				448	const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
				449	const float32x4_t ninety = vdupq_n_f32(90.0f * SCALE_FACTOR);
				450	const float32x4_t oneeighty = vdupq_n_f32(180.0f * SCALE_FACTOR);
				451	const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
				452	const float32x4_t scale = vdupq_n_f32(SCALE_360);
				453
				454	float32x4_t abs_gx = vabsq_f32(gx);
				455	float32x4_t abs_gy = vabsq_f32(gy);
				456	float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
				457	float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
				458	float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
				459	float32x4_t absz = vabsq_f32(z);
				460	float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
				461
				462	/* Compute y = pi/4 * x - x(abs(x)-1)(0.2447+0.0663 * abs(x) */
				463	float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
				464	result = vmulq_f32(result, term);
				465	result = vmlaq_f32(result, piover4, z);
				466
				467	/* Radians to degrees conversion with applied a scale factor in order to have the result [0, 255] */
				468	result = vmulq_f32(result, scale);
				469
				470	/* If z > 1, result = 90 - result */
				471	result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
				472
				473	/* Choose correct quadrant */
				474	result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
				475	result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
				476
				477	return result;
				478	}
				479
				480	inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
				481	{
				482	const float32x4_t zero = vdupq_n_f32(0.0f);
				483	const float32x4_t epsilon = vdupq_n_f32(1e-9f); // epsilon used to avoiding division by 0
				484	const float32x4_t piover4 = vdupq_n_f32(PI_4);
				485	const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
				486	const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
				487	const float32x4_t ninety = vdupq_n_f32(90.0f);
				488	const float32x4_t oneeighty = vdupq_n_f32(180.0f);
				489	const float32x4_t threesixty = vdupq_n_f32(360.0f);
				490	const float32x4_t scale = vdupq_n_f32(SCALE_180);
				491
				492	float32x4_t abs_gx = vabsq_f32(gx);
				493	float32x4_t abs_gy = vabsq_f32(gy);
				494	float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
				495	float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
				496	float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
				497	float32x4_t absz = vabsq_f32(z);
				498
				499	/* Compute y = pi/4 * z - z(abs(z)-1)(0.2447+0.0663 * abs(z) */
				500	float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
				501	float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
				502	result = vmulq_f32(result, term);
				503	result = vmlaq_f32(result, piover4, z);
				504
				505	/* Radians to degrees conversion */
				506	result = vmulq_f32(result, scale);
				507
				508	/* If z > 1, result = 90 - result */
				509	result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
				510
				511	/* Choose correct quadrant */
				512	result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
				513	result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
				514	result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
				515
				516	return result;
				517	}
				518
				519	inline float32x4_t invsqrtv(float32x4_t x)
				520	{
				521	float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
				522
				523	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				524	sqrt_reciprocal);
				525	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				526	sqrt_reciprocal);
				527
				528	return sqrt_reciprocal;
				529	}
				530
				531	inline float32x4_t sqrtv(float32x4_t x)
				532	{
				533	float32x4_t res = vdupq_n_f32(0.5f);
				534	return vmlaq_f32(res, x, invsqrtv(x));
				535	}
				536
				537	inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
				538	{
				539	const int32x4x2_t square_x =
				540	{
				541	{
				542	vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
				543	vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
				544	}
				545	};
				546
				547	const int32x4x2_t square_y =
				548	{
				549	{
				550	vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
				551	vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
				552	}
				553	};
				554
				555	const uint32x4x2_t sum =
				556	{
				557	{
				558	vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
				559	vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
				560	}
				561	};
				562
				563	const float32x4x2_t res =
				564	{
				565	{
				566	sqrtv(vcvtq_f32_u32(sum.val[0])),
				567	sqrtv(vcvtq_f32_u32(sum.val[1]))
				568	}
				569	};
				570
				571	return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
				572	vqmovn_s32(vcvtq_s32_f32(res.val[1])));
				573	}
				574
				575	inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
				576	{
				577	int16x8_t gx_abs = vabsq_s16(input1);
				578	int16x8_t gy_abs = vabsq_s16(input2);
				579
				580	/* Saturating add */
				581	return vqaddq_s16(gx_abs, gy_abs);
				582	}
				583
				584	inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
				585	{
				586	const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
				587
				588	float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
				589	float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
				590	float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
				591	float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
				592
				593	/* Compute fast atan2 */
				594	float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
				595	float32x4_t angle_low = atan2_0_360(inputx_f32_low, inputy_f32_low);
				596
				597	angle_high = vaddq_f32(angle_high, zeropointfive);
				598	angle_low = vaddq_f32(angle_low, zeropointfive);
				599
				600	return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
				601	vqmovun_s32(vcvtq_s32_f32(angle_high))));
				602	}
				603
				604	inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
				605	{
				606	const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
				607
				608	float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
				609	float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
				610	float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
				611	float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
				612
				613	/* Compute fast atan2 */
				614	float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
				615	float32x4_t angle_low = atan2_0_180(inputx_f32_low, inputy_f32_low);
				616
				617	angle_high = vaddq_f32(angle_high, zeropointfive);
				618	angle_low = vaddq_f32(angle_low, zeropointfive);
				619
				620	return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
				621	vqmovun_s32(vcvtq_s32_f32(angle_high))));
				622	}
				623	} // namespace
				624
				625	template <MagnitudeType mag_type, PhaseType phase_type>
				626	NEMagnitudePhaseKernel<mag_type, phase_type>::NEMagnitudePhaseKernel()
				627	: _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
				628	{
				629	}
				630
				631	template <MagnitudeType mag_type, PhaseType phase_type>
				632	void NEMagnitudePhaseKernel<mag_type, phase_type>::configure(const ITensor gx, const ITensor gy, ITensor magnitude, ITensor phase)
				633	{
				634	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16);
				635	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16);
				636	ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
				637
				638	const bool run_mag = magnitude != nullptr;
				639	const bool run_phase = phase != nullptr;
				640
				641	if(run_mag)
				642	{
				643	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16);
				644	}
				645
				646	if(run_phase)
				647	{
				648	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
				649	}
				650
				651	_gx = gx;
				652	_gy = gy;
				653	_magnitude = magnitude;
				654	_phase = phase;
				655
				656	if(run_mag && run_phase)
				657	{
				658	/* Run magnitude and phase */
				659	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase;
				660	}
				661	else
				662	{
				663	if(run_mag)
				664	{
				665	/* Run magnitude */
				666	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude;
				667	}
				668	else if(run_phase)
				669	{
				670	/* Run phase */
				671	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::phase;
				672	}
				673	else
				674	{
				675	ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
				676	}
				677	}
				678
				679	constexpr unsigned int num_elems_processed_per_iteration = 16;
				680
				681	// Configure kernel window
				682	Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
				683	AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
				684	AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
				685
				686	update_window_and_padding(win,
				687	AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
				688	AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
				689	magnitude_access,
				690	phase_access);
				691
				692	ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
				693	gy->info()->valid_region());
				694
				695	magnitude_access.set_valid_region(win, valid_region);
				696	phase_access.set_valid_region(win, valid_region);
				697
				698	INEKernel::configure(win);
				699	}
				700
				701	template <MagnitudeType mag_type, PhaseType phase_type>
				702	void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude(const Window &window)
				703	{
				704	Iterator gx(_gx, window);
				705	Iterator gy(_gy, window);
				706	Iterator magnitude(_magnitude, window);
				707
				708	execute_window_loop(window, [&](const Coordinates & id)
				709	{
				710	const int16x8x2_t input1 =
				711	{
				712	{
				713	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				714	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				715	}
				716	};
				717
				718	const int16x8x2_t input2 =
				719	{
				720	{
				721	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				722	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				723	}
				724	};
				725
				726	/* Compute magnitude */
				727	int16x8x2_t mag{ {} };
				728
				729	if(MagnitudeType::L2NORM == mag_type)
				730	{
				731	mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
				732	mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
				733	}
				734	else
				735	{
				736	mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
				737	mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
				738	}
				739
				740	/* Store magnitude */
				741	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				742	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				743	},
				744	gx, gy, magnitude);
				745	}
				746
				747	template <MagnitudeType mag_type, PhaseType phase_type>
				748	void NEMagnitudePhaseKernel<mag_type, phase_type>::phase(const Window &window)
				749	{
				750	Iterator gx(_gx, window);
				751	Iterator gy(_gy, window);
				752	Iterator phase(_phase, window);
				753
				754	execute_window_loop(window, [&](const Coordinates & id)
				755	{
				756	const int16x8x2_t input1 =
				757	{
				758	{
				759	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				760	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				761	}
				762	};
				763
				764	const int16x8x2_t input2 =
				765	{
				766	{
				767	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				768	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				769	}
				770	};
				771
				772	/* Compute phase */
				773	uint8x8x2_t vphase{ {} };
				774
				775	if(PhaseType::SIGNED == phase_type)
				776	{
				777	vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
				778	vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
				779	}
				780	else
				781	{
				782	vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
				783	vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
				784	}
				785
				786	/* Store phase */
				787	vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
				788	},
				789	gx, gy, phase);
				790	}
				791
				792	template <MagnitudeType mag_type, PhaseType phase_type>
				793	void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase(const Window &window)
				794	{
				795	Iterator gx(_gx, window);
				796	Iterator gy(_gy, window);
				797	Iterator magnitude(_magnitude, window);
				798	Iterator phase(_phase, window);
				799
				800	execute_window_loop(window, [&](const Coordinates & id)
				801	{
				802	const int16x8x2_t input1 =
				803	{
				804	{
				805	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				806	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				807	}
				808	};
				809
				810	const int16x8x2_t input2 =
				811	{
				812	{
				813	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				814	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				815	}
				816	};
				817
				818	/* Compute magnitude */
				819	int16x8x2_t mag{ {} };
				820
				821	if(MagnitudeType::L2NORM == mag_type)
				822	{
				823	mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
				824	mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
				825	}
				826	else
				827	{
				828	mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
				829	mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
				830	}
				831
				832	/* Store magnitude */
				833	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				834	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				835
				836	/* Compute phase */
				837	uint8x8x2_t vphase{ {} };
				838
				839	if(PhaseType::SIGNED == phase_type)
				840	{
				841	vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
				842	vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
				843	}
				844	else
				845	{
				846	vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
				847	vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
				848	}
				849
				850	/* Store phase */
				851	vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
				852	},
				853	gx, gy, magnitude, phase);
				854	}
				855
				856	template <MagnitudeType mag_type, PhaseType phase_type>
				857	void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window)
				858	{
				859	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				860	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				861	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				862
				863	(this->*_func)(window);
				864	}
				865
				866	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
				867	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
				868	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
				869	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;