Blame - src/core/NEON/kernels/NEHarrisCornersKernel.cpp - ml/ComputeLibrary

blob: 233b2baabe690c3c5dd98d85e929df6ccfcf7b40 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
				25
				26	#include "arm_compute/core/Coordinates.h"
				27	#include "arm_compute/core/Error.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/TensorInfo.h"
				30	#include "arm_compute/core/Types.h"
				31	#include "arm_compute/core/Utils.h"
				32	#include "arm_compute/core/Validate.h"
				33	#include "arm_compute/core/Window.h"
				34
				35	#include <algorithm>
				36	#include <arm_neon.h>
				37	#include <cmath>
				38	#include <cstddef>
				39
				40	using namespace arm_compute;
				41
				42	#ifdef ARM_COMPUTE_ENABLE_FP16
				43
				44	template class arm_compute::NEHarrisScoreFP16Kernel<3>;
				45	template class arm_compute::NEHarrisScoreFP16Kernel<5>;
				46	template class arm_compute::NEHarrisScoreFP16Kernel<7>;
				47
				48	namespace fp16
				49	{
				50	inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
				51	{
				52	static const float16x8_t zero = vdupq_n_f16(0.f);
				53
				54	// Trace^2
				55	float16x8_t trace2 = vaddq_f16(gx2, gy2);
				56	trace2 = vmulq_f16(trace2, trace2);
				57
				58	// Det(A)
				59	float16x8_t det = vmulq_f16(gx2, gy2);
				60	det = vfmsq_f16(det, gxgy, gxgy);
				61
				62	// Det(A) - sensitivity * trace^2
				63	const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
				64
				65	// mc > strength_thresh
				66	const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
				67
				68	return vbslq_f16(mask, mc, zero);
				69	}
				70
				71	template <size_t block_size>
				72	inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
				73	float norm_factor)
				74	{
				75	const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
				76
				77	// Normalize
				78	low_gx = vmulq_f16(low_gx, norm_factor_fp16);
				79	low_gy = vmulq_f16(low_gy, norm_factor_fp16);
				80	high_gx = vmulq_f16(high_gx, norm_factor_fp16);
				81	high_gy = vmulq_f16(high_gy, norm_factor_fp16);
				82
				83	float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
				84	float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
				85
				86	gx2 = vfmaq_f16(gx2, gx, gx);
				87	gy2 = vfmaq_f16(gy2, gy, gy);
				88	gxgy = vfmaq_f16(gxgy, gx, gy);
				89
				90	gx = vextq_f16(low_gx, high_gx, 1);
				91	gy = vextq_f16(low_gy, high_gy, 1);
				92
				93	gx2 = vfmaq_f16(gx2, gx, gx);
				94	gy2 = vfmaq_f16(gy2, gy, gy);
				95	gxgy = vfmaq_f16(gxgy, gx, gy);
				96
				97	gx = vextq_f16(low_gx, high_gx, 2);
				98	gy = vextq_f16(low_gy, high_gy, 2);
				99
				100	gx2 = vfmaq_f16(gx2, gx, gx);
				101	gy2 = vfmaq_f16(gy2, gy, gy);
				102	gxgy = vfmaq_f16(gxgy, gx, gy);
				103
				104	if(block_size > 3)
				105	{
				106	gx = vextq_f16(low_gx, high_gx, 3);
				107	gy = vextq_f16(low_gy, high_gy, 3);
				108
				109	gx2 = vfmaq_f16(gx2, gx, gx);
				110	gy2 = vfmaq_f16(gy2, gy, gy);
				111	gxgy = vfmaq_f16(gxgy, gx, gy);
				112
				113	gx = vextq_f16(low_gx, high_gx, 4);
				114	gy = vextq_f16(low_gy, high_gy, 4);
				115
				116	gx2 = vfmaq_f16(gx2, gx, gx);
				117	gy2 = vfmaq_f16(gy2, gy, gy);
				118	gxgy = vfmaq_f16(gxgy, gx, gy);
				119	}
				120
				121	if(block_size == 7)
				122	{
				123	gx = vextq_f16(low_gx, high_gx, 5);
				124	gy = vextq_f16(low_gy, high_gy, 5);
				125
				126	gx2 = vfmaq_f16(gx2, gx, gx);
				127	gy2 = vfmaq_f16(gy2, gy, gy);
				128	gxgy = vfmaq_f16(gxgy, gx, gy);
				129
				130	gx = vextq_f16(low_gx, high_gx, 6);
				131	gy = vextq_f16(low_gy, high_gy, 6);
				132
				133	gx2 = vfmaq_f16(gx2, gx, gx);
				134	gy2 = vfmaq_f16(gy2, gy, gy);
				135	gxgy = vfmaq_f16(gxgy, gx, gy);
				136	}
				137	}
				138
				139	template <size_t block_size>
				140	inline void harris_score_S16_S16_FLOAT(const void __restrict in1_ptr, const void __restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
				141	float strength_thresh)
				142	{
				143	auto gx_ptr_0 = static_cast<const int16_t __restrict>(in1_ptr) - (block_size / 2) (in_stride + 1);
				144	auto gy_ptr_0 = static_cast<const int16_t __restrict>(in2_ptr) - (block_size / 2) (in_stride + 1);
				145	const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
				146	const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
				147	const auto output = static_cast<float *__restrict>(out_ptr);
				148
				149	// Gx^2, Gy^2 and Gx*Gy
				150	float16x8_t gx2 = vdupq_n_f16(0.0f);
				151	float16x8_t gy2 = vdupq_n_f16(0.0f);
				152	float16x8_t gxgy = vdupq_n_f16(0.0f);
				153
				154	for(size_t i = 0; i < block_size; ++i)
				155	{
				156	const float16x8_t low_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
				157	const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
				158	const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
				159	const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
				160	harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
				161
				162	// Update gx and gy pointer
				163	gx_ptr_0 += in_stride;
				164	gy_ptr_0 += in_stride;
				165	gx_ptr_1 += in_stride;
				166	gy_ptr_1 += in_stride;
				167	}
				168
				169	// Calculate harris score
				170	const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				171
				172	// Store score
				173	vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
				174	vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
				175	}
				176
				177	template <size_t block_size>
				178	inline void harris_score_S32_S32_FLOAT(const void __restrict in1_ptr, const void __restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
				179	float strength_thresh)
				180	{
				181	static const float16x8_t zero = vdupq_n_f16(0.0f);
				182
				183	auto gx_ptr_0 = static_cast<const int32_t __restrict>(in1_ptr) - (block_size / 2) (in_stride + 1);
				184	auto gy_ptr_0 = static_cast<const int32_t __restrict>(in2_ptr) - (block_size / 2) (in_stride + 1);
				185	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				186	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				187	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				188	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				189	const auto output = static_cast<float *__restrict>(out_ptr);
				190
				191	// Gx^2, Gy^2 and Gx*Gy
				192	float16x8_t gx2 = zero;
				193	float16x8_t gy2 = zero;
				194	float16x8_t gxgy = zero;
				195
				196	for(size_t i = 0; i < block_size; ++i)
				197	{
				198	const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
				199	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
				200	const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
				201	vget_low_f16(zero));
				202	const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
				203	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
				204	const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
				205	vget_low_f16(zero));
				206	harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
				207
				208	// Update gx and gy pointer
				209	gx_ptr_0 += in_stride;
				210	gy_ptr_0 += in_stride;
				211	gx_ptr_1 += in_stride;
				212	gy_ptr_1 += in_stride;
				213	gx_ptr_2 += in_stride;
				214	gy_ptr_2 += in_stride;
				215	}
				216
				217	// Calculate harris score
				218	const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				219
				220	// Store score
				221	vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
				222	vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
				223	}
				224
				225	template <>
				226	inline void harris_score_S32_S32_FLOAT<7>(const void __restrict in1_ptr, const void __restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
				227	float strength_thresh)
				228	{
				229	static const float16x8_t zero = vdupq_n_f16(0.0f);
				230
				231	auto gx_ptr_0 = static_cast<const int32_t __restrict>(in1_ptr) - 3 (in_stride + 1);
				232	auto gy_ptr_0 = static_cast<const int32_t __restrict>(in2_ptr) - 3 (in_stride + 1);
				233	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				234	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				235	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				236	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				237	const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
				238	const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
				239	const auto output = static_cast<float *__restrict>(out_ptr);
				240
				241	// Gx^2, Gy^2 and Gx*Gy
				242	float16x8_t gx2 = zero;
				243	float16x8_t gy2 = zero;
				244	float16x8_t gxgy = zero;
				245
				246	for(size_t i = 0; i < 7; ++i)
				247	{
				248	const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
				249	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
				250	const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
				251	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
				252	const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
				253	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
				254	const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
				255	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
				256	harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
				257
				258	// Update gx and gy pointer
				259	gx_ptr_0 += in_stride;
				260	gy_ptr_0 += in_stride;
				261	gx_ptr_1 += in_stride;
				262	gy_ptr_1 += in_stride;
				263	gx_ptr_2 += in_stride;
				264	gy_ptr_2 += in_stride;
				265	}
				266
				267	// Calculate harris score
				268	const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				269
				270	// Store score
				271	vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
				272	vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
				273	}
				274
				275	} // namespace fp16
				276
				277	template <int32_t block_size>
				278	BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
				279	{
				280	return _border_size;
				281	}
				282
				283	template <int32_t block_size>
				284	NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
				285	: INEHarrisScoreKernel(), _func(nullptr)
				286	{
				287	}
				288
				289	template <int32_t block_size>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame^]	290	void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	291	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame^]	292	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	293	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				294	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				295	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				296
				297	Iterator input1(_input1, window);
				298	Iterator input2(_input2, window);
				299	Iterator output(_output, window);
				300
				301	const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
				302
				303	execute_window_loop(window, [&](const Coordinates & id)
				304	{
				305	(*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
				306	},
				307	input1, input2, output);
				308	}
				309
				310	template <int32_t block_size>
				311	void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage input1, const IImage input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
				312	bool border_undefined)
				313	{
				314	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
				315	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
				316	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
				317	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
				318	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
				319	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
				320	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
				321	ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
				322
				323	_input1 = input1;
				324	_input2 = input2;
				325	_output = output;
				326	_sensitivity = sensitivity;
				327	_strength_thresh = strength_thresh;
				328	_norm_factor = norm_factor;
				329	_border_size = BorderSize(block_size / 2);
				330
				331	if(input1->info()->data_type() == DataType::S16)
				332	{
				333	_func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
				334	}
				335	else
				336	{
				337	_func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
				338	}
				339
				340	ARM_COMPUTE_ERROR_ON(nullptr == _func);
				341
				342	constexpr unsigned int num_elems_processed_per_iteration = 8;
				343	constexpr unsigned int num_elems_read_per_iteration = 16;
				344	constexpr unsigned int num_elems_written_per_iteration = 8;
				345	constexpr unsigned int num_rows_read_per_iteration = block_size;
				346
				347	// Configure kernel window
				348	Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				349	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				350
				351	update_window_and_padding(win,
				352	AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				353	AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				354	output_access);
				355
				356	ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
				357	input2->info()->valid_region());
				358
				359	output_access.set_valid_region(win, valid_region, border_undefined, border_size());
				360
				361	INEKernel::configure(win);
				362	}
				363
Anthony Barbier	ac69aa1	2017-07-03 17:39:37 +0100	[diff] [blame]	364	#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	365
				366	template class arm_compute::NEHarrisScoreKernel<3>;
				367	template class arm_compute::NEHarrisScoreKernel<5>;
				368	template class arm_compute::NEHarrisScoreKernel<7>;
				369	template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
				370	template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
				371	template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
				372
				373	namespace
				374	{
				375	inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
				376	{
				377	// Trace^2
				378	float32x4_t trace2 = vaddq_f32(gx2, gy2);
				379	trace2 = vmulq_f32(trace2, trace2);
				380
				381	// Det(A)
				382	float32x4_t det = vmulq_f32(gx2, gy2);
				383	det = vmlsq_f32(det, gxgy, gxgy);
				384
				385	// Det(A) - sensitivity * trace^2
				386	const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
				387
				388	// mc > strength_thresh
				389	const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
				390
				391	return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
				392	}
				393
				394	inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
				395	float32x4_t norm_factor)
				396	{
				397	// Normalize
				398	low_gx = vmulq_f32(low_gx, norm_factor);
				399	low_gy = vmulq_f32(low_gy, norm_factor);
				400	high_gx = vmulq_f32(high_gx, norm_factor);
				401	high_gy = vmulq_f32(high_gy, norm_factor);
				402
				403	const float32x4_t l_gx = low_gx;
				404	const float32x4_t l_gy = low_gy;
				405	const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
				406	const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
				407	const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
				408	const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
				409
				410	// Gx*Gx
				411	gx2 = vmlaq_f32(gx2, l_gx, l_gx);
				412	gx2 = vmlaq_f32(gx2, m_gx, m_gx);
				413	gx2 = vmlaq_f32(gx2, r_gx, r_gx);
				414
				415	// Gy*Gy
				416	gy2 = vmlaq_f32(gy2, l_gy, l_gy);
				417	gy2 = vmlaq_f32(gy2, m_gy, m_gy);
				418	gy2 = vmlaq_f32(gy2, r_gy, r_gy);
				419
				420	// Gx*Gy
				421	gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
				422	gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
				423	gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
				424	}
				425
				426	inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
				427	float32x4_t norm_factor)
				428	{
				429	// Normalize
				430	low_gx = vmulq_f32(low_gx, norm_factor);
				431	low_gy = vmulq_f32(low_gy, norm_factor);
				432	high_gx = vmulq_f32(high_gx, norm_factor);
				433	high_gy = vmulq_f32(high_gy, norm_factor);
				434
				435	// L2 values
				436	float32x4_t gx = low_gx;
				437	float32x4_t gy = low_gy;
				438
				439	// Accumulate
				440	gx2 = vmlaq_f32(gx2, gx, gx);
				441	gy2 = vmlaq_f32(gy2, gy, gy);
				442	gxgy = vmlaq_f32(gxgy, gx, gy);
				443
				444	// L1 values
				445	gx = vextq_f32(low_gx, high_gx, 1);
				446	gy = vextq_f32(low_gy, high_gy, 1);
				447
				448	// Accumulate
				449	gx2 = vmlaq_f32(gx2, gx, gx);
				450	gy2 = vmlaq_f32(gy2, gy, gy);
				451	gxgy = vmlaq_f32(gxgy, gx, gy);
				452
				453	// M values
				454	gx = vextq_f32(low_gx, high_gx, 2);
				455	gy = vextq_f32(low_gy, high_gy, 2);
				456
				457	// Accumulate
				458	gx2 = vmlaq_f32(gx2, gx, gx);
				459	gy2 = vmlaq_f32(gy2, gy, gy);
				460	gxgy = vmlaq_f32(gxgy, gx, gy);
				461
				462	// R1 values
				463	gx = vextq_f32(low_gx, high_gx, 3);
				464	gy = vextq_f32(low_gy, high_gy, 3);
				465
				466	// Accumulate
				467	gx2 = vmlaq_f32(gx2, gx, gx);
				468	gy2 = vmlaq_f32(gy2, gy, gy);
				469	gxgy = vmlaq_f32(gxgy, gx, gy);
				470
				471	// R2 values
				472	gx = high_gx;
				473	gy = high_gy;
				474
				475	// Accumulate
				476	gx2 = vmlaq_f32(gx2, gx, gx);
				477	gy2 = vmlaq_f32(gy2, gy, gy);
				478	gxgy = vmlaq_f32(gxgy, gx, gy);
				479	}
				480
				481	inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
				482	float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
				483	{
				484	// Normalize
				485	low_gx = vmulq_f32(low_gx, norm_factor);
				486	low_gy = vmulq_f32(low_gy, norm_factor);
				487	high_gx = vmulq_f32(high_gx, norm_factor);
				488	high_gy = vmulq_f32(high_gy, norm_factor);
				489
				490	// L3 values
				491	float32x4_t gx = low_gx;
				492	float32x4_t gy = low_gy;
				493
				494	// Accumulate
				495	gx2 = vmlaq_f32(gx2, gx, gx);
				496	gy2 = vmlaq_f32(gy2, gy, gy);
				497	gxgy = vmlaq_f32(gxgy, gx, gy);
				498
				499	// L2 values
				500	gx = vextq_f32(low_gx, high_gx, 1);
				501	gy = vextq_f32(low_gy, high_gy, 1);
				502
				503	// Accumulate
				504	gx2 = vmlaq_f32(gx2, gx, gx);
				505	gy2 = vmlaq_f32(gy2, gy, gy);
				506	gxgy = vmlaq_f32(gxgy, gx, gy);
				507
				508	// L1 values
				509	gx = vextq_f32(low_gx, high_gx, 2);
				510	gy = vextq_f32(low_gy, high_gy, 2);
				511
				512	// Accumulate
				513	gx2 = vmlaq_f32(gx2, gx, gx);
				514	gy2 = vmlaq_f32(gy2, gy, gy);
				515	gxgy = vmlaq_f32(gxgy, gx, gy);
				516
				517	// M values
				518	gx = vextq_f32(low_gx, high_gx, 3);
				519	gy = vextq_f32(low_gy, high_gy, 3);
				520
				521	// Accumulate
				522	gx2 = vmlaq_f32(gx2, gx, gx);
				523	gy2 = vmlaq_f32(gy2, gy, gy);
				524	gxgy = vmlaq_f32(gxgy, gx, gy);
				525
				526	// R1 values
				527	gx = high_gx;
				528	gy = high_gy;
				529
				530	// Accumulate
				531	gx2 = vmlaq_f32(gx2, gx, gx);
				532	gy2 = vmlaq_f32(gy2, gy, gy);
				533	gxgy = vmlaq_f32(gxgy, gx, gy);
				534
				535	// Change tmp_low and tmp_high for calculating R2 and R3 values
				536	low_gx = high_gx;
				537	low_gy = high_gy;
				538	high_gx = high_gx1;
				539	high_gy = high_gy1;
				540
				541	// Normalize
				542	high_gx = vmulq_f32(high_gx, norm_factor);
				543	high_gy = vmulq_f32(high_gy, norm_factor);
				544
				545	// R2 values
				546	gx = vextq_f32(low_gx, high_gx, 1);
				547	gy = vextq_f32(low_gy, high_gy, 1);
				548
				549	// Accumulate
				550	gx2 = vmlaq_f32(gx2, gx, gx);
				551	gy2 = vmlaq_f32(gy2, gy, gy);
				552	gxgy = vmlaq_f32(gxgy, gx, gy);
				553
				554	// R3 values
				555	gx = vextq_f32(low_gx, high_gx, 2);
				556	gy = vextq_f32(low_gy, high_gy, 2);
				557
				558	// Accumulate
				559	gx2 = vmlaq_f32(gx2, gx, gx);
				560	gy2 = vmlaq_f32(gy2, gy, gy);
				561	gxgy = vmlaq_f32(gxgy, gx, gy);
				562	}
				563
				564	inline void harris_score3x3_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				565	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				566
				567	{
				568	const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
				569	const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
				570	const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
				571	const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
				572	const auto output = static_cast<float *__restrict>(output_ptr);
				573
				574	// Gx^2, Gy^2 and Gx*Gy
				575	float32x4x2_t gx2 =
				576	{
				577	{
				578	vdupq_n_f32(0.0f),
				579	vdupq_n_f32(0.0f)
				580	}
				581	};
				582	float32x4x2_t gy2 =
				583	{
				584	{
				585	vdupq_n_f32(0.0f),
				586	vdupq_n_f32(0.0f)
				587	}
				588	};
				589	float32x4x2_t gxgy =
				590	{
				591	{
				592	vdupq_n_f32(0.0f),
				593	vdupq_n_f32(0.0f)
				594	}
				595	};
				596
				597	// Row0
				598	int16x8x2_t tmp_gx =
				599	{
				600	{
				601	vld1q_s16(gx_ptr_0 - input_stride),
				602	vld1q_s16(gx_ptr_1 - input_stride)
				603	}
				604	};
				605	int16x8x2_t tmp_gy =
				606	{
				607	{
				608	vld1q_s16(gy_ptr_0 - input_stride),
				609	vld1q_s16(gy_ptr_1 - input_stride)
				610	}
				611	};
				612	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				613	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				614	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				615
				616	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				617	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				618	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				619	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				620	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				621
				622	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				623	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				624	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				625	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				626	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				627
				628	// Row1
				629	tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
				630	tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
				631	tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
				632	tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
				633
				634	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				635	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				636	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				637	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				638	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				639
				640	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				641	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				642	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				643	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				644	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				645
				646	// Row2
				647	tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
				648	tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
				649	tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
				650	tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
				651
				652	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				653	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				654	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				655	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				656	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				657
				658	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				659	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				660	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				661	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				662	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				663
				664	// Calculate harris score
				665	const float32x4x2_t mc =
				666	{
				667	{
				668	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				669	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				670	}
				671	};
				672
				673	// Store score
				674	vst1q_f32(output + 0, mc.val[0]);
				675	vst1q_f32(output + 4, mc.val[1]);
				676	}
				677
				678	inline void harris_score3x3_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				679	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				680	{
				681	auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
				682	auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
				683	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				684	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				685	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				686	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				687	const auto output = static_cast<float *__restrict>(output_ptr);
				688	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				689	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				690	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				691
				692	// Gx^2, Gy^2 and Gx*Gy
				693	float32x4x2_t gx2 =
				694	{
				695	{
				696	vdupq_n_f32(0.0f),
				697	vdupq_n_f32(0.0f)
				698	}
				699	};
				700	float32x4x2_t gy2 =
				701	{
				702	{
				703	vdupq_n_f32(0.0f),
				704	vdupq_n_f32(0.0f)
				705	}
				706	};
				707	float32x4x2_t gxgy =
				708	{
				709	{
				710	vdupq_n_f32(0.0f),
				711	vdupq_n_f32(0.0f)
				712	}
				713	};
				714
				715	// Row0
				716	float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
				717	float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
				718	float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
				719	float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
				720	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				721
				722	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
				723	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
				724	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
				725	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
				726	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				727
				728	// Row1
				729	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				730	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				731	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				732	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				733	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				734
				735	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				736	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				737	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				738	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				739	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				740
				741	// Row2
				742	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
				743	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
				744	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
				745	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
				746	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				747
				748	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
				749	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
				750	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
				751	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
				752	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				753
				754	// Calculate harris score
				755	const float32x4x2_t mc =
				756	{
				757	{
				758	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				759	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				760	}
				761	};
				762
				763	// Store score
				764	vst1q_f32(output + 0, mc.val[0]);
				765	vst1q_f32(output + 4, mc.val[1]);
				766	}
				767
				768	inline void harris_score5x5_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				769	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				770	{
				771	auto gx_ptr_0 = static_cast<const int16_t __restrict>(input1_ptr) - 2 - 2 input_stride;
				772	auto gy_ptr_0 = static_cast<const int16_t __restrict>(input2_ptr) - 2 - 2 input_stride;
				773	const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
				774	const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
				775	const auto output = static_cast<float *__restrict>(output_ptr);
				776
				777	// Gx^2, Gy^2 and Gx*Gy
				778	float32x4x2_t gx2 =
				779	{
				780	{
				781	vdupq_n_f32(0.0f),
				782	vdupq_n_f32(0.0f)
				783	}
				784	};
				785	float32x4x2_t gy2 =
				786	{
				787	{
				788	vdupq_n_f32(0.0f),
				789	vdupq_n_f32(0.0f)
				790	}
				791	};
				792	float32x4x2_t gxgy =
				793	{
				794	{
				795	vdupq_n_f32(0.0f),
				796	vdupq_n_f32(0.0f)
				797	}
				798	};
				799	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				800	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				801	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				802
				803	for(int i = 0; i < 5; ++i)
				804	{
				805	const int16x8x2_t tmp_gx =
				806	{
				807	{
				808	vld1q_s16(gx_ptr_0),
				809	vld1q_s16(gx_ptr_1)
				810	}
				811	};
				812	const int16x8x2_t tmp_gy =
				813	{
				814	{
				815	vld1q_s16(gy_ptr_0),
				816	vld1q_s16(gy_ptr_1)
				817	}
				818	};
				819
				820	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				821	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				822	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				823	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				824	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				825
				826	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				827	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				828	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				829	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				830	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				831
				832	// Update gx and gy pointer
				833	gx_ptr_0 += input_stride;
				834	gy_ptr_0 += input_stride;
				835	gx_ptr_1 += input_stride;
				836	gy_ptr_1 += input_stride;
				837	}
				838
				839	// Calculate harris score
				840	const float32x4x2_t mc =
				841	{
				842	{
				843	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				844	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				845	}
				846	};
				847
				848	// Store score
				849	vst1q_f32(output + 0, mc.val[0]);
				850	vst1q_f32(output + 4, mc.val[1]);
				851	}
				852
				853	inline void harris_score5x5_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				854	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				855
				856	{
				857	auto gx_ptr_0 = static_cast<const int32_t __restrict>(input1_ptr) - 2 - 2 input_stride;
				858	auto gy_ptr_0 = static_cast<const int32_t __restrict>(input2_ptr) - 2 - 2 input_stride;
				859	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				860	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				861	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				862	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				863	const auto output = static_cast<float *__restrict>(output_ptr);
				864
				865	// Gx^2, Gy^2 and Gx*Gy
				866	float32x4x2_t gx2 =
				867	{
				868	{
				869	vdupq_n_f32(0.0f),
				870	vdupq_n_f32(0.0f)
				871	}
				872	};
				873	float32x4x2_t gy2 =
				874	{
				875	{
				876	vdupq_n_f32(0.0f),
				877	vdupq_n_f32(0.0f)
				878	}
				879	};
				880	float32x4x2_t gxgy =
				881	{
				882	{
				883	vdupq_n_f32(0.0f),
				884	vdupq_n_f32(0.0f)
				885	}
				886	};
				887	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				888	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				889	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				890
				891	for(int i = 0; i < 5; ++i)
				892	{
				893	const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				894	const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				895	const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				896	const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				897	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				898
				899	const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				900	const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				901	const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				902	const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				903	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				904
				905	// Update gx and gy pointer
				906	gx_ptr_0 += input_stride;
				907	gy_ptr_0 += input_stride;
				908	gx_ptr_1 += input_stride;
				909	gy_ptr_1 += input_stride;
				910	gx_ptr_2 += input_stride;
				911	gy_ptr_2 += input_stride;
				912	}
				913
				914	// Calculate harris score
				915	const float32x4x2_t mc =
				916	{
				917	{
				918	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				919	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				920	}
				921	};
				922
				923	// Store score
				924	vst1q_f32(output + 0, mc.val[0]);
				925	vst1q_f32(output + 4, mc.val[1]);
				926	}
				927
				928	inline void harris_score7x7_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				929	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				930	{
				931	auto gx_ptr_0 = static_cast<const int16_t __restrict>(input1_ptr) - 3 - 3 input_stride;
				932	auto gy_ptr_0 = static_cast<const int16_t __restrict>(input2_ptr) - 3 - 3 input_stride;
				933	const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
				934	const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
				935	const auto output = static_cast<float *__restrict>(output_ptr);
				936
				937	// Gx^2, Gy^2 and Gx*Gy
				938	float32x4_t gx2 = vdupq_n_f32(0.0f);
				939	float32x4_t gy2 = vdupq_n_f32(0.0f);
				940	float32x4_t gxgy = vdupq_n_f32(0.0f);
				941	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				942	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				943	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				944
				945	for(int i = 0; i < 7; ++i)
				946	{
				947	const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
				948	const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
				949	const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
				950	const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
				951
				952	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
				953	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
				954	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
				955	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
				956	float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
				957	float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
				958	harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
				959
				960	// Update gx and gy pointer
				961	gx_ptr_0 += input_stride;
				962	gy_ptr_0 += input_stride;
				963	gx_ptr_1 += input_stride;
				964	gy_ptr_1 += input_stride;
				965	}
				966
				967	// Calculate harris score
				968	const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				969
				970	// Store score
				971	vst1q_f32(output, mc);
				972	}
				973
				974	inline void harris_score7x7_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				975	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				976	{
				977	auto gx_ptr_0 = static_cast<const int32_t __restrict>(input1_ptr) - 3 - 3 input_stride;
				978	auto gy_ptr_0 = static_cast<const int32_t __restrict>(input2_ptr) - 3 - 3 input_stride;
				979	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				980	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				981	const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
				982	const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
				983	const auto output = static_cast<float *__restrict>(output_ptr);
				984
				985	// Gx^2, Gy^2 and Gx*Gy
				986	float32x4_t gx2 = vdupq_n_f32(0.0f);
				987	float32x4_t gy2 = vdupq_n_f32(0.0f);
				988	float32x4_t gxgy = vdupq_n_f32(0.0f);
				989	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				990	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				991	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				992
				993	for(int i = 0; i < 7; ++i)
				994	{
				995	const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				996	const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				997	const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				998	const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				999	const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				1000	const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				1001	harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
				1002
				1003	// Update gx and gy pointer
				1004	gx_ptr_0 += input_stride;
				1005	gy_ptr_0 += input_stride;
				1006	gx_ptr_1 += input_stride;
				1007	gy_ptr_1 += input_stride;
				1008	gx_ptr_2 += input_stride;
				1009	gy_ptr_2 += input_stride;
				1010	}
				1011
				1012	// Calculate harris score
				1013	const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				1014
				1015	// Store score
				1016	vst1q_f32(output, mc);
				1017	}
				1018
				1019	} // namespace
				1020
				1021	INEHarrisScoreKernel::INEHarrisScoreKernel()
				1022	: _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
				1023	{
				1024	}
				1025
				1026	template <int32_t block_size>
				1027	NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
				1028	: INEHarrisScoreKernel(), _func(nullptr)
				1029	{
				1030	}
				1031
				1032	template <int32_t block_size>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame^]	1033	void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1034	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame^]	1035	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1036	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				1037	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				1038	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				1039
				1040	Iterator input1(_input1, window);
				1041	Iterator input2(_input2, window);
				1042	Iterator output(_output, window);
				1043
				1044	const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
				1045
				1046	execute_window_loop(window, [&](const Coordinates & id)
				1047	{
				1048	(*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
				1049	},
				1050	input1, input2, output);
				1051	}
				1052
				1053	template <int32_t block_size>
				1054	BorderSize NEHarrisScoreKernel<block_size>::border_size() const
				1055	{
				1056	return _border_size;
				1057	}
				1058
				1059	template <int32_t block_size>
				1060	void NEHarrisScoreKernel<block_size>::configure(const IImage input1, const IImage input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
				1061	bool border_undefined)
				1062	{
				1063	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
				1064	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
				1065	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
				1066	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
				1067	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
				1068	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
				1069	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
				1070	ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
				1071
				1072	_input1 = input1;
				1073	_input2 = input2;
				1074	_output = output;
				1075	_sensitivity = sensitivity;
				1076	_strength_thresh = strength_thresh;
				1077	_norm_factor = norm_factor;
				1078	_border_size = BorderSize(block_size / 2);
				1079
				1080	if(input1->info()->data_type() == DataType::S16)
				1081	{
				1082	switch(block_size)
				1083	{
				1084	case 3:
				1085	_func = &harris_score3x3_S16_S16_FLOAT;
				1086	break;
				1087	case 5:
				1088	_func = &harris_score5x5_S16_S16_FLOAT;
				1089	break;
				1090	case 7:
				1091	_func = &harris_score7x7_S16_S16_FLOAT;
				1092	break;
				1093	default:
				1094	ARM_COMPUTE_ERROR("Invalid block size");
				1095	break;
				1096	}
				1097	}
				1098	else
				1099	{
				1100	switch(block_size)
				1101	{
				1102	case 3:
				1103	_func = &harris_score3x3_S32_S32_FLOAT;
				1104	break;
				1105	case 5:
				1106	_func = &harris_score5x5_S32_S32_FLOAT;
				1107	break;
				1108	case 7:
				1109	_func = &harris_score7x7_S32_S32_FLOAT;
				1110	break;
				1111	default:
				1112	ARM_COMPUTE_ERROR("Invalid block size");
				1113	break;
				1114	}
				1115	}
				1116
				1117	ARM_COMPUTE_ERROR_ON(nullptr == _func);
				1118
				1119	constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
				1120	constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
				1121	constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
				1122	constexpr unsigned int num_rows_read_per_iteration = block_size;
				1123
				1124	// Configure kernel window
				1125	Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				1126	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				1127
				1128	update_window_and_padding(win,
				1129	AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				1130	AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				1131	output_access);
				1132
				1133	ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
				1134	input2->info()->valid_region());
				1135
				1136	output_access.set_valid_region(win, valid_region, border_undefined, border_size());
				1137
				1138	INEKernel::configure(win);
				1139	}