Blame - src/core/NEON/kernels/NEHarrisCornersKernel.cpp - ml/ComputeLibrary

blob: d44f4ce3b8c72827b8cc3082910c2bc3e6333d23 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
				25
				26	#include "arm_compute/core/Coordinates.h"
				27	#include "arm_compute/core/Error.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/TensorInfo.h"
				30	#include "arm_compute/core/Types.h"
				31	#include "arm_compute/core/Utils.h"
				32	#include "arm_compute/core/Validate.h"
				33	#include "arm_compute/core/Window.h"
				34
				35	#include <algorithm>
				36	#include <arm_neon.h>
				37	#include <cmath>
				38	#include <cstddef>
				39
				40	using namespace arm_compute;
				41
				42	#ifdef ARM_COMPUTE_ENABLE_FP16
				43
				44	template class arm_compute::NEHarrisScoreFP16Kernel<3>;
				45	template class arm_compute::NEHarrisScoreFP16Kernel<5>;
				46	template class arm_compute::NEHarrisScoreFP16Kernel<7>;
				47
				48	namespace fp16
				49	{
				50	inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
				51	{
				52	static const float16x8_t zero = vdupq_n_f16(0.f);
				53
				54	// Trace^2
				55	float16x8_t trace2 = vaddq_f16(gx2, gy2);
				56	trace2 = vmulq_f16(trace2, trace2);
				57
				58	// Det(A)
				59	float16x8_t det = vmulq_f16(gx2, gy2);
				60	det = vfmsq_f16(det, gxgy, gxgy);
				61
				62	// Det(A) - sensitivity * trace^2
				63	const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
				64
				65	// mc > strength_thresh
				66	const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
				67
				68	return vbslq_f16(mask, mc, zero);
				69	}
				70
				71	template <size_t block_size>
				72	inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
				73	float norm_factor)
				74	{
				75	const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
				76
				77	// Normalize
				78	low_gx = vmulq_f16(low_gx, norm_factor_fp16);
				79	low_gy = vmulq_f16(low_gy, norm_factor_fp16);
				80	high_gx = vmulq_f16(high_gx, norm_factor_fp16);
				81	high_gy = vmulq_f16(high_gy, norm_factor_fp16);
				82
				83	float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
				84	float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
				85
				86	gx2 = vfmaq_f16(gx2, gx, gx);
				87	gy2 = vfmaq_f16(gy2, gy, gy);
				88	gxgy = vfmaq_f16(gxgy, gx, gy);
				89
				90	gx = vextq_f16(low_gx, high_gx, 1);
				91	gy = vextq_f16(low_gy, high_gy, 1);
				92
				93	gx2 = vfmaq_f16(gx2, gx, gx);
				94	gy2 = vfmaq_f16(gy2, gy, gy);
				95	gxgy = vfmaq_f16(gxgy, gx, gy);
				96
				97	gx = vextq_f16(low_gx, high_gx, 2);
				98	gy = vextq_f16(low_gy, high_gy, 2);
				99
				100	gx2 = vfmaq_f16(gx2, gx, gx);
				101	gy2 = vfmaq_f16(gy2, gy, gy);
				102	gxgy = vfmaq_f16(gxgy, gx, gy);
				103
				104	if(block_size > 3)
				105	{
				106	gx = vextq_f16(low_gx, high_gx, 3);
				107	gy = vextq_f16(low_gy, high_gy, 3);
				108
				109	gx2 = vfmaq_f16(gx2, gx, gx);
				110	gy2 = vfmaq_f16(gy2, gy, gy);
				111	gxgy = vfmaq_f16(gxgy, gx, gy);
				112
				113	gx = vextq_f16(low_gx, high_gx, 4);
				114	gy = vextq_f16(low_gy, high_gy, 4);
				115
				116	gx2 = vfmaq_f16(gx2, gx, gx);
				117	gy2 = vfmaq_f16(gy2, gy, gy);
				118	gxgy = vfmaq_f16(gxgy, gx, gy);
				119	}
				120
				121	if(block_size == 7)
				122	{
				123	gx = vextq_f16(low_gx, high_gx, 5);
				124	gy = vextq_f16(low_gy, high_gy, 5);
				125
				126	gx2 = vfmaq_f16(gx2, gx, gx);
				127	gy2 = vfmaq_f16(gy2, gy, gy);
				128	gxgy = vfmaq_f16(gxgy, gx, gy);
				129
				130	gx = vextq_f16(low_gx, high_gx, 6);
				131	gy = vextq_f16(low_gy, high_gy, 6);
				132
				133	gx2 = vfmaq_f16(gx2, gx, gx);
				134	gy2 = vfmaq_f16(gy2, gy, gy);
				135	gxgy = vfmaq_f16(gxgy, gx, gy);
				136	}
				137	}
				138
				139	template <size_t block_size>
				140	inline void harris_score_S16_S16_FLOAT(const void __restrict in1_ptr, const void __restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
				141	float strength_thresh)
				142	{
				143	auto gx_ptr_0 = static_cast<const int16_t __restrict>(in1_ptr) - (block_size / 2) (in_stride + 1);
				144	auto gy_ptr_0 = static_cast<const int16_t __restrict>(in2_ptr) - (block_size / 2) (in_stride + 1);
				145	const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
				146	const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
				147	const auto output = static_cast<float *__restrict>(out_ptr);
				148
				149	// Gx^2, Gy^2 and Gx*Gy
				150	float16x8_t gx2 = vdupq_n_f16(0.0f);
				151	float16x8_t gy2 = vdupq_n_f16(0.0f);
				152	float16x8_t gxgy = vdupq_n_f16(0.0f);
				153
				154	for(size_t i = 0; i < block_size; ++i)
				155	{
				156	const float16x8_t low_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
				157	const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
				158	const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
				159	const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
				160	harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
				161
				162	// Update gx and gy pointer
				163	gx_ptr_0 += in_stride;
				164	gy_ptr_0 += in_stride;
				165	gx_ptr_1 += in_stride;
				166	gy_ptr_1 += in_stride;
				167	}
				168
				169	// Calculate harris score
				170	const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				171
				172	// Store score
				173	vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
				174	vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
				175	}
				176
				177	template <size_t block_size>
				178	inline void harris_score_S32_S32_FLOAT(const void __restrict in1_ptr, const void __restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
				179	float strength_thresh)
				180	{
				181	static const float16x8_t zero = vdupq_n_f16(0.0f);
				182
				183	auto gx_ptr_0 = static_cast<const int32_t __restrict>(in1_ptr) - (block_size / 2) (in_stride + 1);
				184	auto gy_ptr_0 = static_cast<const int32_t __restrict>(in2_ptr) - (block_size / 2) (in_stride + 1);
				185	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				186	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				187	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				188	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				189	const auto output = static_cast<float *__restrict>(out_ptr);
				190
				191	// Gx^2, Gy^2 and Gx*Gy
				192	float16x8_t gx2 = zero;
				193	float16x8_t gy2 = zero;
				194	float16x8_t gxgy = zero;
				195
				196	for(size_t i = 0; i < block_size; ++i)
				197	{
				198	const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
				199	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
				200	const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
				201	vget_low_f16(zero));
				202	const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
				203	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
				204	const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
				205	vget_low_f16(zero));
				206	harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
				207
				208	// Update gx and gy pointer
				209	gx_ptr_0 += in_stride;
				210	gy_ptr_0 += in_stride;
				211	gx_ptr_1 += in_stride;
				212	gy_ptr_1 += in_stride;
				213	gx_ptr_2 += in_stride;
				214	gy_ptr_2 += in_stride;
				215	}
				216
				217	// Calculate harris score
				218	const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				219
				220	// Store score
				221	vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
				222	vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
				223	}
				224
				225	template <>
				226	inline void harris_score_S32_S32_FLOAT<7>(const void __restrict in1_ptr, const void __restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
				227	float strength_thresh)
				228	{
				229	static const float16x8_t zero = vdupq_n_f16(0.0f);
				230
				231	auto gx_ptr_0 = static_cast<const int32_t __restrict>(in1_ptr) - 3 (in_stride + 1);
				232	auto gy_ptr_0 = static_cast<const int32_t __restrict>(in2_ptr) - 3 (in_stride + 1);
				233	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				234	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				235	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				236	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				237	const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
				238	const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
				239	const auto output = static_cast<float *__restrict>(out_ptr);
				240
				241	// Gx^2, Gy^2 and Gx*Gy
				242	float16x8_t gx2 = zero;
				243	float16x8_t gy2 = zero;
				244	float16x8_t gxgy = zero;
				245
				246	for(size_t i = 0; i < 7; ++i)
				247	{
				248	const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
				249	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
				250	const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
				251	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
				252	const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
				253	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
				254	const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
				255	vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
				256	harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
				257
				258	// Update gx and gy pointer
				259	gx_ptr_0 += in_stride;
				260	gy_ptr_0 += in_stride;
				261	gx_ptr_1 += in_stride;
				262	gy_ptr_1 += in_stride;
				263	gx_ptr_2 += in_stride;
				264	gy_ptr_2 += in_stride;
				265	}
				266
				267	// Calculate harris score
				268	const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				269
				270	// Store score
				271	vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
				272	vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
				273	}
				274
				275	} // namespace fp16
				276
				277	template <int32_t block_size>
				278	BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
				279	{
				280	return _border_size;
				281	}
				282
				283	template <int32_t block_size>
				284	NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
				285	: INEHarrisScoreKernel(), _func(nullptr)
				286	{
				287	}
				288
				289	template <int32_t block_size>
				290	void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window)
				291	{
				292	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				293	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				294	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				295
				296	Iterator input1(_input1, window);
				297	Iterator input2(_input2, window);
				298	Iterator output(_output, window);
				299
				300	const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
				301
				302	execute_window_loop(window, [&](const Coordinates & id)
				303	{
				304	(*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
				305	},
				306	input1, input2, output);
				307	}
				308
				309	template <int32_t block_size>
				310	void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage input1, const IImage input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
				311	bool border_undefined)
				312	{
				313	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
				314	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
				315	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
				316	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
				317	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
				318	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
				319	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
				320	ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
				321
				322	_input1 = input1;
				323	_input2 = input2;
				324	_output = output;
				325	_sensitivity = sensitivity;
				326	_strength_thresh = strength_thresh;
				327	_norm_factor = norm_factor;
				328	_border_size = BorderSize(block_size / 2);
				329
				330	if(input1->info()->data_type() == DataType::S16)
				331	{
				332	_func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
				333	}
				334	else
				335	{
				336	_func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
				337	}
				338
				339	ARM_COMPUTE_ERROR_ON(nullptr == _func);
				340
				341	constexpr unsigned int num_elems_processed_per_iteration = 8;
				342	constexpr unsigned int num_elems_read_per_iteration = 16;
				343	constexpr unsigned int num_elems_written_per_iteration = 8;
				344	constexpr unsigned int num_rows_read_per_iteration = block_size;
				345
				346	// Configure kernel window
				347	Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				348	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				349
				350	update_window_and_padding(win,
				351	AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				352	AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				353	output_access);
				354
				355	ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
				356	input2->info()->valid_region());
				357
				358	output_access.set_valid_region(win, valid_region, border_undefined, border_size());
				359
				360	INEKernel::configure(win);
				361	}
				362
Anthony Barbier	ac69aa1	2017-07-03 17:39:37 +0100	[diff] [blame]	363	#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	364
				365	template class arm_compute::NEHarrisScoreKernel<3>;
				366	template class arm_compute::NEHarrisScoreKernel<5>;
				367	template class arm_compute::NEHarrisScoreKernel<7>;
				368	template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
				369	template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
				370	template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
				371
				372	namespace
				373	{
				374	inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
				375	{
				376	// Trace^2
				377	float32x4_t trace2 = vaddq_f32(gx2, gy2);
				378	trace2 = vmulq_f32(trace2, trace2);
				379
				380	// Det(A)
				381	float32x4_t det = vmulq_f32(gx2, gy2);
				382	det = vmlsq_f32(det, gxgy, gxgy);
				383
				384	// Det(A) - sensitivity * trace^2
				385	const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
				386
				387	// mc > strength_thresh
				388	const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
				389
				390	return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
				391	}
				392
				393	inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
				394	float32x4_t norm_factor)
				395	{
				396	// Normalize
				397	low_gx = vmulq_f32(low_gx, norm_factor);
				398	low_gy = vmulq_f32(low_gy, norm_factor);
				399	high_gx = vmulq_f32(high_gx, norm_factor);
				400	high_gy = vmulq_f32(high_gy, norm_factor);
				401
				402	const float32x4_t l_gx = low_gx;
				403	const float32x4_t l_gy = low_gy;
				404	const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
				405	const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
				406	const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
				407	const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
				408
				409	// Gx*Gx
				410	gx2 = vmlaq_f32(gx2, l_gx, l_gx);
				411	gx2 = vmlaq_f32(gx2, m_gx, m_gx);
				412	gx2 = vmlaq_f32(gx2, r_gx, r_gx);
				413
				414	// Gy*Gy
				415	gy2 = vmlaq_f32(gy2, l_gy, l_gy);
				416	gy2 = vmlaq_f32(gy2, m_gy, m_gy);
				417	gy2 = vmlaq_f32(gy2, r_gy, r_gy);
				418
				419	// Gx*Gy
				420	gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
				421	gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
				422	gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
				423	}
				424
				425	inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
				426	float32x4_t norm_factor)
				427	{
				428	// Normalize
				429	low_gx = vmulq_f32(low_gx, norm_factor);
				430	low_gy = vmulq_f32(low_gy, norm_factor);
				431	high_gx = vmulq_f32(high_gx, norm_factor);
				432	high_gy = vmulq_f32(high_gy, norm_factor);
				433
				434	// L2 values
				435	float32x4_t gx = low_gx;
				436	float32x4_t gy = low_gy;
				437
				438	// Accumulate
				439	gx2 = vmlaq_f32(gx2, gx, gx);
				440	gy2 = vmlaq_f32(gy2, gy, gy);
				441	gxgy = vmlaq_f32(gxgy, gx, gy);
				442
				443	// L1 values
				444	gx = vextq_f32(low_gx, high_gx, 1);
				445	gy = vextq_f32(low_gy, high_gy, 1);
				446
				447	// Accumulate
				448	gx2 = vmlaq_f32(gx2, gx, gx);
				449	gy2 = vmlaq_f32(gy2, gy, gy);
				450	gxgy = vmlaq_f32(gxgy, gx, gy);
				451
				452	// M values
				453	gx = vextq_f32(low_gx, high_gx, 2);
				454	gy = vextq_f32(low_gy, high_gy, 2);
				455
				456	// Accumulate
				457	gx2 = vmlaq_f32(gx2, gx, gx);
				458	gy2 = vmlaq_f32(gy2, gy, gy);
				459	gxgy = vmlaq_f32(gxgy, gx, gy);
				460
				461	// R1 values
				462	gx = vextq_f32(low_gx, high_gx, 3);
				463	gy = vextq_f32(low_gy, high_gy, 3);
				464
				465	// Accumulate
				466	gx2 = vmlaq_f32(gx2, gx, gx);
				467	gy2 = vmlaq_f32(gy2, gy, gy);
				468	gxgy = vmlaq_f32(gxgy, gx, gy);
				469
				470	// R2 values
				471	gx = high_gx;
				472	gy = high_gy;
				473
				474	// Accumulate
				475	gx2 = vmlaq_f32(gx2, gx, gx);
				476	gy2 = vmlaq_f32(gy2, gy, gy);
				477	gxgy = vmlaq_f32(gxgy, gx, gy);
				478	}
				479
				480	inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
				481	float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
				482	{
				483	// Normalize
				484	low_gx = vmulq_f32(low_gx, norm_factor);
				485	low_gy = vmulq_f32(low_gy, norm_factor);
				486	high_gx = vmulq_f32(high_gx, norm_factor);
				487	high_gy = vmulq_f32(high_gy, norm_factor);
				488
				489	// L3 values
				490	float32x4_t gx = low_gx;
				491	float32x4_t gy = low_gy;
				492
				493	// Accumulate
				494	gx2 = vmlaq_f32(gx2, gx, gx);
				495	gy2 = vmlaq_f32(gy2, gy, gy);
				496	gxgy = vmlaq_f32(gxgy, gx, gy);
				497
				498	// L2 values
				499	gx = vextq_f32(low_gx, high_gx, 1);
				500	gy = vextq_f32(low_gy, high_gy, 1);
				501
				502	// Accumulate
				503	gx2 = vmlaq_f32(gx2, gx, gx);
				504	gy2 = vmlaq_f32(gy2, gy, gy);
				505	gxgy = vmlaq_f32(gxgy, gx, gy);
				506
				507	// L1 values
				508	gx = vextq_f32(low_gx, high_gx, 2);
				509	gy = vextq_f32(low_gy, high_gy, 2);
				510
				511	// Accumulate
				512	gx2 = vmlaq_f32(gx2, gx, gx);
				513	gy2 = vmlaq_f32(gy2, gy, gy);
				514	gxgy = vmlaq_f32(gxgy, gx, gy);
				515
				516	// M values
				517	gx = vextq_f32(low_gx, high_gx, 3);
				518	gy = vextq_f32(low_gy, high_gy, 3);
				519
				520	// Accumulate
				521	gx2 = vmlaq_f32(gx2, gx, gx);
				522	gy2 = vmlaq_f32(gy2, gy, gy);
				523	gxgy = vmlaq_f32(gxgy, gx, gy);
				524
				525	// R1 values
				526	gx = high_gx;
				527	gy = high_gy;
				528
				529	// Accumulate
				530	gx2 = vmlaq_f32(gx2, gx, gx);
				531	gy2 = vmlaq_f32(gy2, gy, gy);
				532	gxgy = vmlaq_f32(gxgy, gx, gy);
				533
				534	// Change tmp_low and tmp_high for calculating R2 and R3 values
				535	low_gx = high_gx;
				536	low_gy = high_gy;
				537	high_gx = high_gx1;
				538	high_gy = high_gy1;
				539
				540	// Normalize
				541	high_gx = vmulq_f32(high_gx, norm_factor);
				542	high_gy = vmulq_f32(high_gy, norm_factor);
				543
				544	// R2 values
				545	gx = vextq_f32(low_gx, high_gx, 1);
				546	gy = vextq_f32(low_gy, high_gy, 1);
				547
				548	// Accumulate
				549	gx2 = vmlaq_f32(gx2, gx, gx);
				550	gy2 = vmlaq_f32(gy2, gy, gy);
				551	gxgy = vmlaq_f32(gxgy, gx, gy);
				552
				553	// R3 values
				554	gx = vextq_f32(low_gx, high_gx, 2);
				555	gy = vextq_f32(low_gy, high_gy, 2);
				556
				557	// Accumulate
				558	gx2 = vmlaq_f32(gx2, gx, gx);
				559	gy2 = vmlaq_f32(gy2, gy, gy);
				560	gxgy = vmlaq_f32(gxgy, gx, gy);
				561	}
				562
				563	inline void harris_score3x3_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				564	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				565
				566	{
				567	const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
				568	const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
				569	const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
				570	const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
				571	const auto output = static_cast<float *__restrict>(output_ptr);
				572
				573	// Gx^2, Gy^2 and Gx*Gy
				574	float32x4x2_t gx2 =
				575	{
				576	{
				577	vdupq_n_f32(0.0f),
				578	vdupq_n_f32(0.0f)
				579	}
				580	};
				581	float32x4x2_t gy2 =
				582	{
				583	{
				584	vdupq_n_f32(0.0f),
				585	vdupq_n_f32(0.0f)
				586	}
				587	};
				588	float32x4x2_t gxgy =
				589	{
				590	{
				591	vdupq_n_f32(0.0f),
				592	vdupq_n_f32(0.0f)
				593	}
				594	};
				595
				596	// Row0
				597	int16x8x2_t tmp_gx =
				598	{
				599	{
				600	vld1q_s16(gx_ptr_0 - input_stride),
				601	vld1q_s16(gx_ptr_1 - input_stride)
				602	}
				603	};
				604	int16x8x2_t tmp_gy =
				605	{
				606	{
				607	vld1q_s16(gy_ptr_0 - input_stride),
				608	vld1q_s16(gy_ptr_1 - input_stride)
				609	}
				610	};
				611	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				612	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				613	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				614
				615	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				616	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				617	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				618	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				619	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				620
				621	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				622	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				623	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				624	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				625	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				626
				627	// Row1
				628	tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
				629	tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
				630	tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
				631	tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
				632
				633	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				634	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				635	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				636	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				637	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				638
				639	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				640	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				641	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				642	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				643	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				644
				645	// Row2
				646	tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
				647	tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
				648	tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
				649	tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
				650
				651	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				652	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				653	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				654	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				655	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				656
				657	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				658	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				659	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				660	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				661	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				662
				663	// Calculate harris score
				664	const float32x4x2_t mc =
				665	{
				666	{
				667	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				668	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				669	}
				670	};
				671
				672	// Store score
				673	vst1q_f32(output + 0, mc.val[0]);
				674	vst1q_f32(output + 4, mc.val[1]);
				675	}
				676
				677	inline void harris_score3x3_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				678	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				679	{
				680	auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
				681	auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
				682	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				683	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				684	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				685	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				686	const auto output = static_cast<float *__restrict>(output_ptr);
				687	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				688	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				689	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				690
				691	// Gx^2, Gy^2 and Gx*Gy
				692	float32x4x2_t gx2 =
				693	{
				694	{
				695	vdupq_n_f32(0.0f),
				696	vdupq_n_f32(0.0f)
				697	}
				698	};
				699	float32x4x2_t gy2 =
				700	{
				701	{
				702	vdupq_n_f32(0.0f),
				703	vdupq_n_f32(0.0f)
				704	}
				705	};
				706	float32x4x2_t gxgy =
				707	{
				708	{
				709	vdupq_n_f32(0.0f),
				710	vdupq_n_f32(0.0f)
				711	}
				712	};
				713
				714	// Row0
				715	float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
				716	float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
				717	float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
				718	float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
				719	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				720
				721	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
				722	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
				723	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
				724	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
				725	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				726
				727	// Row1
				728	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				729	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				730	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				731	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				732	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				733
				734	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				735	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				736	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				737	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				738	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				739
				740	// Row2
				741	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
				742	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
				743	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
				744	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
				745	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				746
				747	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
				748	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
				749	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
				750	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
				751	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				752
				753	// Calculate harris score
				754	const float32x4x2_t mc =
				755	{
				756	{
				757	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				758	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				759	}
				760	};
				761
				762	// Store score
				763	vst1q_f32(output + 0, mc.val[0]);
				764	vst1q_f32(output + 4, mc.val[1]);
				765	}
				766
				767	inline void harris_score5x5_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				768	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				769	{
				770	auto gx_ptr_0 = static_cast<const int16_t __restrict>(input1_ptr) - 2 - 2 input_stride;
				771	auto gy_ptr_0 = static_cast<const int16_t __restrict>(input2_ptr) - 2 - 2 input_stride;
				772	const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
				773	const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
				774	const auto output = static_cast<float *__restrict>(output_ptr);
				775
				776	// Gx^2, Gy^2 and Gx*Gy
				777	float32x4x2_t gx2 =
				778	{
				779	{
				780	vdupq_n_f32(0.0f),
				781	vdupq_n_f32(0.0f)
				782	}
				783	};
				784	float32x4x2_t gy2 =
				785	{
				786	{
				787	vdupq_n_f32(0.0f),
				788	vdupq_n_f32(0.0f)
				789	}
				790	};
				791	float32x4x2_t gxgy =
				792	{
				793	{
				794	vdupq_n_f32(0.0f),
				795	vdupq_n_f32(0.0f)
				796	}
				797	};
				798	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				799	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				800	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				801
				802	for(int i = 0; i < 5; ++i)
				803	{
				804	const int16x8x2_t tmp_gx =
				805	{
				806	{
				807	vld1q_s16(gx_ptr_0),
				808	vld1q_s16(gx_ptr_1)
				809	}
				810	};
				811	const int16x8x2_t tmp_gy =
				812	{
				813	{
				814	vld1q_s16(gy_ptr_0),
				815	vld1q_s16(gy_ptr_1)
				816	}
				817	};
				818
				819	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				820	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				821	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				822	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				823	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				824
				825	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				826	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				827	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				828	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				829	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				830
				831	// Update gx and gy pointer
				832	gx_ptr_0 += input_stride;
				833	gy_ptr_0 += input_stride;
				834	gx_ptr_1 += input_stride;
				835	gy_ptr_1 += input_stride;
				836	}
				837
				838	// Calculate harris score
				839	const float32x4x2_t mc =
				840	{
				841	{
				842	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				843	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				844	}
				845	};
				846
				847	// Store score
				848	vst1q_f32(output + 0, mc.val[0]);
				849	vst1q_f32(output + 4, mc.val[1]);
				850	}
				851
				852	inline void harris_score5x5_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				853	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				854
				855	{
				856	auto gx_ptr_0 = static_cast<const int32_t __restrict>(input1_ptr) - 2 - 2 input_stride;
				857	auto gy_ptr_0 = static_cast<const int32_t __restrict>(input2_ptr) - 2 - 2 input_stride;
				858	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				859	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				860	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				861	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				862	const auto output = static_cast<float *__restrict>(output_ptr);
				863
				864	// Gx^2, Gy^2 and Gx*Gy
				865	float32x4x2_t gx2 =
				866	{
				867	{
				868	vdupq_n_f32(0.0f),
				869	vdupq_n_f32(0.0f)
				870	}
				871	};
				872	float32x4x2_t gy2 =
				873	{
				874	{
				875	vdupq_n_f32(0.0f),
				876	vdupq_n_f32(0.0f)
				877	}
				878	};
				879	float32x4x2_t gxgy =
				880	{
				881	{
				882	vdupq_n_f32(0.0f),
				883	vdupq_n_f32(0.0f)
				884	}
				885	};
				886	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				887	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				888	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				889
				890	for(int i = 0; i < 5; ++i)
				891	{
				892	const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				893	const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				894	const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				895	const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				896	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				897
				898	const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				899	const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				900	const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				901	const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				902	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				903
				904	// Update gx and gy pointer
				905	gx_ptr_0 += input_stride;
				906	gy_ptr_0 += input_stride;
				907	gx_ptr_1 += input_stride;
				908	gy_ptr_1 += input_stride;
				909	gx_ptr_2 += input_stride;
				910	gy_ptr_2 += input_stride;
				911	}
				912
				913	// Calculate harris score
				914	const float32x4x2_t mc =
				915	{
				916	{
				917	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				918	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				919	}
				920	};
				921
				922	// Store score
				923	vst1q_f32(output + 0, mc.val[0]);
				924	vst1q_f32(output + 4, mc.val[1]);
				925	}
				926
				927	inline void harris_score7x7_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				928	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				929	{
				930	auto gx_ptr_0 = static_cast<const int16_t __restrict>(input1_ptr) - 3 - 3 input_stride;
				931	auto gy_ptr_0 = static_cast<const int16_t __restrict>(input2_ptr) - 3 - 3 input_stride;
				932	const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
				933	const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
				934	const auto output = static_cast<float *__restrict>(output_ptr);
				935
				936	// Gx^2, Gy^2 and Gx*Gy
				937	float32x4_t gx2 = vdupq_n_f32(0.0f);
				938	float32x4_t gy2 = vdupq_n_f32(0.0f);
				939	float32x4_t gxgy = vdupq_n_f32(0.0f);
				940	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				941	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				942	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				943
				944	for(int i = 0; i < 7; ++i)
				945	{
				946	const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
				947	const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
				948	const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
				949	const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
				950
				951	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
				952	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
				953	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
				954	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
				955	float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
				956	float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
				957	harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
				958
				959	// Update gx and gy pointer
				960	gx_ptr_0 += input_stride;
				961	gy_ptr_0 += input_stride;
				962	gx_ptr_1 += input_stride;
				963	gy_ptr_1 += input_stride;
				964	}
				965
				966	// Calculate harris score
				967	const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				968
				969	// Store score
				970	vst1q_f32(output, mc);
				971	}
				972
				973	inline void harris_score7x7_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				974	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				975	{
				976	auto gx_ptr_0 = static_cast<const int32_t __restrict>(input1_ptr) - 3 - 3 input_stride;
				977	auto gy_ptr_0 = static_cast<const int32_t __restrict>(input2_ptr) - 3 - 3 input_stride;
				978	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				979	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				980	const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
				981	const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
				982	const auto output = static_cast<float *__restrict>(output_ptr);
				983
				984	// Gx^2, Gy^2 and Gx*Gy
				985	float32x4_t gx2 = vdupq_n_f32(0.0f);
				986	float32x4_t gy2 = vdupq_n_f32(0.0f);
				987	float32x4_t gxgy = vdupq_n_f32(0.0f);
				988	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				989	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				990	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				991
				992	for(int i = 0; i < 7; ++i)
				993	{
				994	const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				995	const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				996	const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				997	const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				998	const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				999	const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				1000	harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
				1001
				1002	// Update gx and gy pointer
				1003	gx_ptr_0 += input_stride;
				1004	gy_ptr_0 += input_stride;
				1005	gx_ptr_1 += input_stride;
				1006	gy_ptr_1 += input_stride;
				1007	gx_ptr_2 += input_stride;
				1008	gy_ptr_2 += input_stride;
				1009	}
				1010
				1011	// Calculate harris score
				1012	const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				1013
				1014	// Store score
				1015	vst1q_f32(output, mc);
				1016	}
				1017
				1018	} // namespace
				1019
				1020	INEHarrisScoreKernel::INEHarrisScoreKernel()
				1021	: _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
				1022	{
				1023	}
				1024
				1025	template <int32_t block_size>
				1026	NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
				1027	: INEHarrisScoreKernel(), _func(nullptr)
				1028	{
				1029	}
				1030
				1031	template <int32_t block_size>
				1032	void NEHarrisScoreKernel<block_size>::run(const Window &window)
				1033	{
				1034	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				1035	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				1036	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				1037
				1038	Iterator input1(_input1, window);
				1039	Iterator input2(_input2, window);
				1040	Iterator output(_output, window);
				1041
				1042	const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
				1043
				1044	execute_window_loop(window, [&](const Coordinates & id)
				1045	{
				1046	(*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
				1047	},
				1048	input1, input2, output);
				1049	}
				1050
				1051	template <int32_t block_size>
				1052	BorderSize NEHarrisScoreKernel<block_size>::border_size() const
				1053	{
				1054	return _border_size;
				1055	}
				1056
				1057	template <int32_t block_size>
				1058	void NEHarrisScoreKernel<block_size>::configure(const IImage input1, const IImage input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
				1059	bool border_undefined)
				1060	{
				1061	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
				1062	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
				1063	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
				1064	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
				1065	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
				1066	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
				1067	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
				1068	ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
				1069
				1070	_input1 = input1;
				1071	_input2 = input2;
				1072	_output = output;
				1073	_sensitivity = sensitivity;
				1074	_strength_thresh = strength_thresh;
				1075	_norm_factor = norm_factor;
				1076	_border_size = BorderSize(block_size / 2);
				1077
				1078	if(input1->info()->data_type() == DataType::S16)
				1079	{
				1080	switch(block_size)
				1081	{
				1082	case 3:
				1083	_func = &harris_score3x3_S16_S16_FLOAT;
				1084	break;
				1085	case 5:
				1086	_func = &harris_score5x5_S16_S16_FLOAT;
				1087	break;
				1088	case 7:
				1089	_func = &harris_score7x7_S16_S16_FLOAT;
				1090	break;
				1091	default:
				1092	ARM_COMPUTE_ERROR("Invalid block size");
				1093	break;
				1094	}
				1095	}
				1096	else
				1097	{
				1098	switch(block_size)
				1099	{
				1100	case 3:
				1101	_func = &harris_score3x3_S32_S32_FLOAT;
				1102	break;
				1103	case 5:
				1104	_func = &harris_score5x5_S32_S32_FLOAT;
				1105	break;
				1106	case 7:
				1107	_func = &harris_score7x7_S32_S32_FLOAT;
				1108	break;
				1109	default:
				1110	ARM_COMPUTE_ERROR("Invalid block size");
				1111	break;
				1112	}
				1113	}
				1114
				1115	ARM_COMPUTE_ERROR_ON(nullptr == _func);
				1116
				1117	constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
				1118	constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
				1119	constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
				1120	constexpr unsigned int num_rows_read_per_iteration = block_size;
				1121
				1122	// Configure kernel window
				1123	Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				1124	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				1125
				1126	update_window_and_padding(win,
				1127	AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				1128	AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				1129	output_access);
				1130
				1131	ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
				1132	input2->info()->valid_region());
				1133
				1134	output_access.set_valid_region(win, valid_region, border_undefined, border_size());
				1135
				1136	INEKernel::configure(win);
				1137	}