Blame - src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp - ml/ComputeLibrary

blob: 433985f6fab4fc1d681377de8d676a66ba05cbdb [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/IAccessWindow.h"
				29	#include "arm_compute/core/ITensor.h"
				30	#include "arm_compute/core/Validate.h"
				31
				32	#include <arm_neon.h>
				33	#include <cstdint>
				34
				35	using namespace arm_compute;
				36
				37	namespace arm_compute
				38	{
				39	class Coordinates;
				40	} // namespace arm_compute
				41
				42	namespace
				43	{
				44	// Defines for computing atan2
				45	constexpr float SCALE_FACTOR = 0.7111111111111111f;
				46	constexpr float PI = 3.141592653589793f;
				47	constexpr float SCALE_180 = 180.0f / PI;
				48	constexpr float SCALE_360 = SCALE_180 * SCALE_FACTOR;
				49	constexpr float PI_4 = 0.7853981633974483f;
				50	constexpr float COEFF1 = 0.0663f;
				51	constexpr float COEFF2 = 0.2447f;
				52	} // namespace
				53
				54	#ifdef ARM_COMPUTE_ENABLE_FP16
				55	namespace fp16
				56	{
				57	inline float16x8_t inv(float16x8_t x)
				58	{
				59	const float16x8_t estimate = vrecpeq_f16(x);
				60	return vmulq_f16(estimate, vrecpsq_f16(x, estimate));
				61	}
				62
				63	inline float16x8_t atan2_fast(float16x8_t gx, float16x8_t gy, float16x8_t scale)
				64	{
				65	static const float16x8_t one = vdupq_n_f16(1.0f);
				66	static const float16x8_t ninety = vdupq_n_f16(90.f * SCALE_FACTOR);
				67	static const float16x8_t epsilon = vdupq_n_f16(1e-9f);
				68	static const float16x8_t piover4 = vdupq_n_f16(PI_4);
				69	static const float16x8_t coeff1 = vdupq_n_f16(COEFF1);
				70	static const float16x8_t coeff2 = vdupq_n_f16(COEFF2);
				71
				72	const float16x8_t abs_gx = vabsq_f16(gx);
				73	const float16x8_t abs_gy = vabsq_f16(gy);
				74	const float16x8_t tmin = vminq_f16(abs_gx, abs_gy);
				75	const float16x8_t tmax = vmaxq_f16(abs_gx, abs_gy);
				76
				77	// z = min(x, y) / max(x, y)
				78	const float16x8_t z = vmulq_f16(tmin, inv(vaddq_f16(tmax, epsilon)));
				79	const float16x8_t absz = vabsq_f16(z);
				80
				81	// = x * [pi/4 + (1 - \|x\|) * (0.2447 + 0.0663 * \|x\|)]
				82	float16x8_t arctan = vmulq_f16(z, vfmaq_f16(piover4,
				83	vsubq_f16(one, absz),
				84	vfmaq_f16(coeff2, coeff1, absz)));
				85
				86	// Radians to degrees conversion with applied a scale factor in order to have the result [0, 255]
				87	arctan = vmulq_f16(arctan, scale);
				88
				89	/* If z > 1, result = 90 - result */
				90	return vbslq_f16(vcgeq_f16(abs_gx, abs_gy), arctan, vsubq_f16(ninety, arctan));
				91	}
				92
				93	inline float16x8_t atan2_0_360(float16x8_t gx, float16x8_t gy)
				94	{
				95	static const float16x8_t scale = vdupq_n_f16(SCALE_360);
				96	static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
				97	static const float16x8_t zero = vdupq_n_f16(0.0f);
				98	static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
				99
				100	float16x8_t arctan = atan2_fast(gx, gy, scale);
				101
				102	// Choose correct quadrant
				103	arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
				104	arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
				105
				106	return arctan;
				107	}
				108
				109	inline float16x8_t atan2_0_180(float16x8_t gx, float16x8_t gy)
				110	{
				111	static const float16x8_t scale = vdupq_n_f16(SCALE_180);
				112	static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
				113	static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
				114	static const float16x8_t zero = vdupq_n_f16(0.0f);
				115
				116	float16x8_t arctan = atan2_fast(gx, gy, scale);
				117
				118	// Choose correct quadrant
				119	arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
				120	arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
				121	arctan = vbslq_f16(vcgtq_f16(arctan, oneeighty), vsubq_f16(arctan, oneeighty), arctan);
				122
				123	return arctan;
				124	}
				125
				126	inline float32x4_t invsqrtv(float32x4_t x)
				127	{
				128	float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
				129
				130	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				131	sqrt_reciprocal);
				132	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				133	sqrt_reciprocal);
				134
				135	return sqrt_reciprocal;
				136	}
				137
				138	inline float32x4_t sqrtv(float32x4_t x)
				139	{
				140	float32x4_t res = vdupq_n_f32(0.5f);
				141	return vmlaq_f32(res, x, invsqrtv(x));
				142	}
				143
				144	inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
				145	{
				146	return vqaddq_s16(vabsq_s16(input1), vabsq_s16(input2));
				147	}
				148
				149	inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
				150	{
				151	const int32x4x2_t square_x =
				152	{
				153	vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
				154	vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
				155	};
				156
				157	const int32x4x2_t square_y =
				158	{
				159	vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
				160	vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
				161	};
				162
				163	const uint32x4x2_t sum =
				164	{
				165	vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]),
				166	vreinterpretq_u32_s32(square_y.val[0])),
				167	vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]),
				168	vreinterpretq_u32_s32(square_y.val[1]))
				169	};
				170
				171	const float32x4x2_t res =
				172	{
				173	sqrtv(vcvtq_f32_u32(sum.val[0])),
				174	sqrtv(vcvtq_f32_u32(sum.val[1]))
				175	};
				176
				177	return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
				178	vqmovn_s32(vcvtq_s32_f32(res.val[1])));
				179	}
				180
				181	inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
				182	{
				183	static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
				184
				185	const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
				186	const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
				187
				188	// Compute fast atan2
				189	const float16x8_t angle = atan2_0_360(inputx_f16, inputy_f16);
				190
				191	return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
				192	}
				193
				194	inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
				195	{
				196	static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
				197
				198	const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
				199	const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
				200
				201	// Compute fast atan2
				202	const float16x8_t angle = atan2_0_180(inputx_f16, inputy_f16);
				203
				204	return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
				205	}
				206
				207	template <MagnitudeType mag_type>
				208	inline int16x8x2_t compute_magnitude(const int16x8x2_t &in0, const int16x8x2_t &gx);
				209
				210	template <>
				211	inline int16x8x2_t compute_magnitude<MagnitudeType::L2NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				212	{
				213	const int16x8x2_t mag =
				214	{
				215	magnitude_l2(in0.val[0], gx.val[0]),
				216	magnitude_l2(in0.val[1], gx.val[1])
				217	};
				218
				219	return mag;
				220	}
				221
				222	template <>
				223	inline int16x8x2_t compute_magnitude<MagnitudeType::L1NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				224	{
				225	const int16x8x2_t mag =
				226	{
				227	magnitude_l1(in0.val[0], gx.val[0]),
				228	magnitude_l1(in0.val[1], gx.val[1])
				229	};
				230
				231	return mag;
				232	}
				233
				234	template <PhaseType phase_type>
				235	inline uint8x16_t compute_phase(const int16x8x2_t &in0, const int16x8x2_t &gx);
				236
				237	template <>
				238	inline uint8x16_t compute_phase<PhaseType::SIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				239	{
				240	return vcombine_u8(phase_signed(in0.val[0], gx.val[0]),
				241	phase_signed(in0.val[1], gx.val[1]));
				242	}
				243
				244	template <>
				245	inline uint8x16_t compute_phase<PhaseType::UNSIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
				246	{
				247	return vcombine_u8(phase_unsigned(in0.val[0], gx.val[0]),
				248	phase_unsigned(in0.val[1], gx.val[1]));
				249	}
				250	} // namespace fp16
				251
				252	template <MagnitudeType mag_type, PhaseType phase_type>
				253	NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::NEMagnitudePhaseFP16Kernel()
				254	: _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
				255	{
				256	}
				257
				258	template <MagnitudeType mag_type, PhaseType phase_type>
				259	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::configure(const ITensor gx, const ITensor gy, ITensor magnitude, ITensor phase)
				260	{
				261	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gx, Format::S16);
				262	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gy, Format::S16);
				263	ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
				264
				265	const bool run_mag = magnitude != nullptr;
				266	const bool run_phase = phase != nullptr;
				267
				268	if(run_mag)
				269	{
				270	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(magnitude, Format::S16);
				271	}
				272
				273	if(run_phase)
				274	{
				275	ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(phase, Format::U8);
				276	}
				277
				278	_gx = gx;
				279	_gy = gy;
				280	_magnitude = magnitude;
				281	_phase = phase;
				282
				283	if(run_mag && run_phase)
				284	{
				285	/* Run magnitude and phase */
				286	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase;
				287	}
				288	else if(run_mag)
				289	{
				290	/* Run magnitude */
				291	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude;
				292	}
				293	else if(run_phase)
				294	{
				295	/* Run phase */
				296	_func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase;
				297	}
				298	else
				299	{
				300	ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
				301	}
				302
				303	const unsigned int num_elems_processed_per_iteration = 16;
				304
				305	// Configure kernel window
				306	Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
				307	AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
				308	AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
				309
				310	update_window_and_padding(win,
				311	AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
				312	AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
				313	magnitude_access,
				314	phase_access);
				315
				316	ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
				317	gy->info()->valid_region());
				318
				319	magnitude_access.set_valid_region(win, valid_region);
				320	phase_access.set_valid_region(win, valid_region);
				321
				322	INEKernel::configure(win);
				323	}
				324
				325	template <MagnitudeType mag_type, PhaseType phase_type>
				326	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude(const Window &window)
				327	{
				328	Iterator gx(_gx, window);
				329	Iterator gy(_gy, window);
				330	Iterator magnitude(_magnitude, window);
				331
				332	execute_window_loop(window, [&](const Coordinates & id)
				333	{
				334	const int16x8x2_t input1 =
				335	{
				336	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				337	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				338	};
				339
				340	const int16x8x2_t input2 =
				341	{
				342	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				343	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				344	};
				345
				346	// Compute and store magnitude
				347	const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
				348
				349	/* Store magnitude */
				350	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				351	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				352	},
				353	gx, gy, magnitude);
				354	}
				355
				356	template <MagnitudeType mag_type, PhaseType phase_type>
				357	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase(const Window &window)
				358	{
				359	Iterator gx(_gx, window);
				360	Iterator gy(_gy, window);
				361	Iterator phase(_phase, window);
				362
				363	execute_window_loop(window, [&](const Coordinates & id)
				364	{
				365	const int16x8x2_t input1 =
				366	{
				367	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				368	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				369	};
				370
				371	const int16x8x2_t input2 =
				372	{
				373	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				374	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				375	};
				376
				377	// Compute and store phase
				378	vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
				379	},
				380	gx, gy, phase);
				381	}
				382
				383	template <MagnitudeType mag_type, PhaseType phase_type>
				384	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase(const Window &window)
				385	{
				386	Iterator gx(_gx, window);
				387	Iterator gy(_gy, window);
				388	Iterator magnitude(_magnitude, window);
				389	Iterator phase(_phase, window);
				390
				391	execute_window_loop(window, [&](const Coordinates & id)
				392	{
				393	const int16x8x2_t input1 =
				394	{
				395	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				396	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				397	};
				398
				399	const int16x8x2_t input2 =
				400	{
				401	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				402	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				403	};
				404
				405	// Compute and store magnitude
				406	const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
				407
				408	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				409	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				410
				411	// Compute and store phase
				412	vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
				413	},
				414	gx, gy, magnitude, phase);
				415	}
				416
				417	template <MagnitudeType mag_type, PhaseType phase_type>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	418	void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	419	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	420	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	421	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				422	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				423	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				424
				425	(this->*_func)(window);
				426	}
				427
				428	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
				429	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
				430	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
				431	template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
Anthony Barbier	ac69aa1	2017-07-03 17:39:37 +0100	[diff] [blame]	432	#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	433
				434	namespace
				435	{
				436	inline float32x4_t inv(float32x4_t x)
				437	{
				438	float32x4_t result = vrecpeq_f32(x);
				439	result = vmulq_f32(vrecpsq_f32(x, result), result);
				440	return result;
				441	}
				442
				443	inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
				444	{
				445	const float32x4_t zero = vdupq_n_f32(0.0f);
				446	const float32x4_t epsilon = vdupq_n_f32(1e-9f);
				447	const float32x4_t piover4 = vdupq_n_f32(PI_4);
				448	const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
				449	const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
				450	const float32x4_t ninety = vdupq_n_f32(90.0f * SCALE_FACTOR);
				451	const float32x4_t oneeighty = vdupq_n_f32(180.0f * SCALE_FACTOR);
				452	const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
				453	const float32x4_t scale = vdupq_n_f32(SCALE_360);
				454
				455	float32x4_t abs_gx = vabsq_f32(gx);
				456	float32x4_t abs_gy = vabsq_f32(gy);
				457	float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
				458	float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
				459	float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
				460	float32x4_t absz = vabsq_f32(z);
				461	float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
				462
				463	/* Compute y = pi/4 * x - x(abs(x)-1)(0.2447+0.0663 * abs(x) */
				464	float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
				465	result = vmulq_f32(result, term);
				466	result = vmlaq_f32(result, piover4, z);
				467
				468	/* Radians to degrees conversion with applied a scale factor in order to have the result [0, 255] */
				469	result = vmulq_f32(result, scale);
				470
				471	/* If z > 1, result = 90 - result */
				472	result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
				473
				474	/* Choose correct quadrant */
				475	result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
				476	result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
				477
				478	return result;
				479	}
				480
				481	inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
				482	{
				483	const float32x4_t zero = vdupq_n_f32(0.0f);
				484	const float32x4_t epsilon = vdupq_n_f32(1e-9f); // epsilon used to avoiding division by 0
				485	const float32x4_t piover4 = vdupq_n_f32(PI_4);
				486	const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
				487	const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
				488	const float32x4_t ninety = vdupq_n_f32(90.0f);
				489	const float32x4_t oneeighty = vdupq_n_f32(180.0f);
				490	const float32x4_t threesixty = vdupq_n_f32(360.0f);
				491	const float32x4_t scale = vdupq_n_f32(SCALE_180);
				492
				493	float32x4_t abs_gx = vabsq_f32(gx);
				494	float32x4_t abs_gy = vabsq_f32(gy);
				495	float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
				496	float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
				497	float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
				498	float32x4_t absz = vabsq_f32(z);
				499
				500	/* Compute y = pi/4 * z - z(abs(z)-1)(0.2447+0.0663 * abs(z) */
				501	float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
				502	float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
				503	result = vmulq_f32(result, term);
				504	result = vmlaq_f32(result, piover4, z);
				505
				506	/* Radians to degrees conversion */
				507	result = vmulq_f32(result, scale);
				508
				509	/* If z > 1, result = 90 - result */
				510	result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
				511
				512	/* Choose correct quadrant */
				513	result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
				514	result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
				515	result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
				516
				517	return result;
				518	}
				519
				520	inline float32x4_t invsqrtv(float32x4_t x)
				521	{
				522	float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
				523
				524	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				525	sqrt_reciprocal);
				526	sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
				527	sqrt_reciprocal);
				528
				529	return sqrt_reciprocal;
				530	}
				531
				532	inline float32x4_t sqrtv(float32x4_t x)
				533	{
				534	float32x4_t res = vdupq_n_f32(0.5f);
				535	return vmlaq_f32(res, x, invsqrtv(x));
				536	}
				537
				538	inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
				539	{
				540	const int32x4x2_t square_x =
				541	{
				542	{
				543	vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
				544	vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
				545	}
				546	};
				547
				548	const int32x4x2_t square_y =
				549	{
				550	{
				551	vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
				552	vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
				553	}
				554	};
				555
				556	const uint32x4x2_t sum =
				557	{
				558	{
				559	vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
				560	vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
				561	}
				562	};
				563
				564	const float32x4x2_t res =
				565	{
				566	{
				567	sqrtv(vcvtq_f32_u32(sum.val[0])),
				568	sqrtv(vcvtq_f32_u32(sum.val[1]))
				569	}
				570	};
				571
				572	return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
				573	vqmovn_s32(vcvtq_s32_f32(res.val[1])));
				574	}
				575
				576	inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
				577	{
				578	int16x8_t gx_abs = vabsq_s16(input1);
				579	int16x8_t gy_abs = vabsq_s16(input2);
				580
				581	/* Saturating add */
				582	return vqaddq_s16(gx_abs, gy_abs);
				583	}
				584
				585	inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
				586	{
				587	const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
				588
				589	float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
				590	float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
				591	float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
				592	float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
				593
				594	/* Compute fast atan2 */
				595	float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
				596	float32x4_t angle_low = atan2_0_360(inputx_f32_low, inputy_f32_low);
				597
				598	angle_high = vaddq_f32(angle_high, zeropointfive);
				599	angle_low = vaddq_f32(angle_low, zeropointfive);
				600
				601	return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
				602	vqmovun_s32(vcvtq_s32_f32(angle_high))));
				603	}
				604
				605	inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
				606	{
				607	const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
				608
				609	float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
				610	float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
				611	float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
				612	float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
				613
				614	/* Compute fast atan2 */
				615	float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
				616	float32x4_t angle_low = atan2_0_180(inputx_f32_low, inputy_f32_low);
				617
				618	angle_high = vaddq_f32(angle_high, zeropointfive);
				619	angle_low = vaddq_f32(angle_low, zeropointfive);
				620
				621	return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
				622	vqmovun_s32(vcvtq_s32_f32(angle_high))));
				623	}
				624	} // namespace
				625
				626	template <MagnitudeType mag_type, PhaseType phase_type>
				627	NEMagnitudePhaseKernel<mag_type, phase_type>::NEMagnitudePhaseKernel()
				628	: _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
				629	{
				630	}
				631
				632	template <MagnitudeType mag_type, PhaseType phase_type>
				633	void NEMagnitudePhaseKernel<mag_type, phase_type>::configure(const ITensor gx, const ITensor gy, ITensor magnitude, ITensor phase)
				634	{
				635	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16);
				636	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16);
				637	ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
				638
				639	const bool run_mag = magnitude != nullptr;
				640	const bool run_phase = phase != nullptr;
				641
				642	if(run_mag)
				643	{
				644	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16);
				645	}
				646
				647	if(run_phase)
				648	{
				649	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
				650	}
				651
				652	_gx = gx;
				653	_gy = gy;
				654	_magnitude = magnitude;
				655	_phase = phase;
				656
				657	if(run_mag && run_phase)
				658	{
				659	/* Run magnitude and phase */
				660	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase;
				661	}
				662	else
				663	{
				664	if(run_mag)
				665	{
				666	/* Run magnitude */
				667	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude;
				668	}
				669	else if(run_phase)
				670	{
				671	/* Run phase */
				672	_func = &NEMagnitudePhaseKernel<mag_type, phase_type>::phase;
				673	}
				674	else
				675	{
				676	ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
				677	}
				678	}
				679
				680	constexpr unsigned int num_elems_processed_per_iteration = 16;
				681
				682	// Configure kernel window
				683	Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
				684	AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
				685	AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
				686
				687	update_window_and_padding(win,
				688	AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
				689	AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
				690	magnitude_access,
				691	phase_access);
				692
				693	ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
				694	gy->info()->valid_region());
				695
				696	magnitude_access.set_valid_region(win, valid_region);
				697	phase_access.set_valid_region(win, valid_region);
				698
				699	INEKernel::configure(win);
				700	}
				701
				702	template <MagnitudeType mag_type, PhaseType phase_type>
				703	void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude(const Window &window)
				704	{
				705	Iterator gx(_gx, window);
				706	Iterator gy(_gy, window);
				707	Iterator magnitude(_magnitude, window);
				708
				709	execute_window_loop(window, [&](const Coordinates & id)
				710	{
				711	const int16x8x2_t input1 =
				712	{
				713	{
				714	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				715	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				716	}
				717	};
				718
				719	const int16x8x2_t input2 =
				720	{
				721	{
				722	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				723	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				724	}
				725	};
				726
				727	/* Compute magnitude */
				728	int16x8x2_t mag{ {} };
				729
				730	if(MagnitudeType::L2NORM == mag_type)
				731	{
				732	mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
				733	mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
				734	}
				735	else
				736	{
				737	mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
				738	mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
				739	}
				740
				741	/* Store magnitude */
				742	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				743	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				744	},
				745	gx, gy, magnitude);
				746	}
				747
				748	template <MagnitudeType mag_type, PhaseType phase_type>
				749	void NEMagnitudePhaseKernel<mag_type, phase_type>::phase(const Window &window)
				750	{
				751	Iterator gx(_gx, window);
				752	Iterator gy(_gy, window);
				753	Iterator phase(_phase, window);
				754
				755	execute_window_loop(window, [&](const Coordinates & id)
				756	{
				757	const int16x8x2_t input1 =
				758	{
				759	{
				760	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				761	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				762	}
				763	};
				764
				765	const int16x8x2_t input2 =
				766	{
				767	{
				768	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				769	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				770	}
				771	};
				772
				773	/* Compute phase */
				774	uint8x8x2_t vphase{ {} };
				775
				776	if(PhaseType::SIGNED == phase_type)
				777	{
				778	vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
				779	vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
				780	}
				781	else
				782	{
				783	vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
				784	vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
				785	}
				786
				787	/* Store phase */
				788	vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
				789	},
				790	gx, gy, phase);
				791	}
				792
				793	template <MagnitudeType mag_type, PhaseType phase_type>
				794	void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase(const Window &window)
				795	{
				796	Iterator gx(_gx, window);
				797	Iterator gy(_gy, window);
				798	Iterator magnitude(_magnitude, window);
				799	Iterator phase(_phase, window);
				800
				801	execute_window_loop(window, [&](const Coordinates & id)
				802	{
				803	const int16x8x2_t input1 =
				804	{
				805	{
				806	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
				807	vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
				808	}
				809	};
				810
				811	const int16x8x2_t input2 =
				812	{
				813	{
				814	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
				815	vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
				816	}
				817	};
				818
				819	/* Compute magnitude */
				820	int16x8x2_t mag{ {} };
				821
				822	if(MagnitudeType::L2NORM == mag_type)
				823	{
				824	mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
				825	mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
				826	}
				827	else
				828	{
				829	mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
				830	mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
				831	}
				832
				833	/* Store magnitude */
				834	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
				835	vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
				836
				837	/* Compute phase */
				838	uint8x8x2_t vphase{ {} };
				839
				840	if(PhaseType::SIGNED == phase_type)
				841	{
				842	vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
				843	vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
				844	}
				845	else
				846	{
				847	vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
				848	vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
				849	}
				850
				851	/* Store phase */
				852	vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
				853	},
				854	gx, gy, magnitude, phase);
				855	}
				856
				857	template <MagnitudeType mag_type, PhaseType phase_type>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	858	void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	859	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	860	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	861	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				862	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				863	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				864
				865	(this->*_func)(window);
				866	}
				867
				868	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
				869	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
				870	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
				871	template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;