Blame - src/core/NEON/kernels/NEHarrisCornersKernel.cpp - ml/ComputeLibrary

blob: 34e68e71cbf717c05ae662930b9d922336dd7ad1 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	2	* Copyright (c) 2016-2019 ARM Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
				25
				26	#include "arm_compute/core/Coordinates.h"
				27	#include "arm_compute/core/Error.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/TensorInfo.h"
				30	#include "arm_compute/core/Types.h"
				31	#include "arm_compute/core/Utils.h"
				32	#include "arm_compute/core/Validate.h"
				33	#include "arm_compute/core/Window.h"
				34
				35	#include <algorithm>
				36	#include <arm_neon.h>
				37	#include <cmath>
				38	#include <cstddef>
				39
				40	using namespace arm_compute;
				41
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	42	template class arm_compute::NEHarrisScoreKernel<3>;
				43	template class arm_compute::NEHarrisScoreKernel<5>;
				44	template class arm_compute::NEHarrisScoreKernel<7>;
				45	template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
				46	template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
				47	template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
				48
				49	namespace
				50	{
				51	inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
				52	{
				53	// Trace^2
				54	float32x4_t trace2 = vaddq_f32(gx2, gy2);
				55	trace2 = vmulq_f32(trace2, trace2);
				56
				57	// Det(A)
				58	float32x4_t det = vmulq_f32(gx2, gy2);
				59	det = vmlsq_f32(det, gxgy, gxgy);
				60
				61	// Det(A) - sensitivity * trace^2
				62	const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
				63
				64	// mc > strength_thresh
				65	const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
				66
				67	return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
				68	}
				69
				70	inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
				71	float32x4_t norm_factor)
				72	{
				73	// Normalize
				74	low_gx = vmulq_f32(low_gx, norm_factor);
				75	low_gy = vmulq_f32(low_gy, norm_factor);
				76	high_gx = vmulq_f32(high_gx, norm_factor);
				77	high_gy = vmulq_f32(high_gy, norm_factor);
				78
				79	const float32x4_t l_gx = low_gx;
				80	const float32x4_t l_gy = low_gy;
				81	const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
				82	const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
				83	const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
				84	const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
				85
				86	// Gx*Gx
				87	gx2 = vmlaq_f32(gx2, l_gx, l_gx);
				88	gx2 = vmlaq_f32(gx2, m_gx, m_gx);
				89	gx2 = vmlaq_f32(gx2, r_gx, r_gx);
				90
				91	// Gy*Gy
				92	gy2 = vmlaq_f32(gy2, l_gy, l_gy);
				93	gy2 = vmlaq_f32(gy2, m_gy, m_gy);
				94	gy2 = vmlaq_f32(gy2, r_gy, r_gy);
				95
				96	// Gx*Gy
				97	gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
				98	gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
				99	gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
				100	}
				101
				102	inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
				103	float32x4_t norm_factor)
				104	{
				105	// Normalize
				106	low_gx = vmulq_f32(low_gx, norm_factor);
				107	low_gy = vmulq_f32(low_gy, norm_factor);
				108	high_gx = vmulq_f32(high_gx, norm_factor);
				109	high_gy = vmulq_f32(high_gy, norm_factor);
				110
				111	// L2 values
				112	float32x4_t gx = low_gx;
				113	float32x4_t gy = low_gy;
				114
				115	// Accumulate
				116	gx2 = vmlaq_f32(gx2, gx, gx);
				117	gy2 = vmlaq_f32(gy2, gy, gy);
				118	gxgy = vmlaq_f32(gxgy, gx, gy);
				119
				120	// L1 values
				121	gx = vextq_f32(low_gx, high_gx, 1);
				122	gy = vextq_f32(low_gy, high_gy, 1);
				123
				124	// Accumulate
				125	gx2 = vmlaq_f32(gx2, gx, gx);
				126	gy2 = vmlaq_f32(gy2, gy, gy);
				127	gxgy = vmlaq_f32(gxgy, gx, gy);
				128
				129	// M values
				130	gx = vextq_f32(low_gx, high_gx, 2);
				131	gy = vextq_f32(low_gy, high_gy, 2);
				132
				133	// Accumulate
				134	gx2 = vmlaq_f32(gx2, gx, gx);
				135	gy2 = vmlaq_f32(gy2, gy, gy);
				136	gxgy = vmlaq_f32(gxgy, gx, gy);
				137
				138	// R1 values
				139	gx = vextq_f32(low_gx, high_gx, 3);
				140	gy = vextq_f32(low_gy, high_gy, 3);
				141
				142	// Accumulate
				143	gx2 = vmlaq_f32(gx2, gx, gx);
				144	gy2 = vmlaq_f32(gy2, gy, gy);
				145	gxgy = vmlaq_f32(gxgy, gx, gy);
				146
				147	// R2 values
				148	gx = high_gx;
				149	gy = high_gy;
				150
				151	// Accumulate
				152	gx2 = vmlaq_f32(gx2, gx, gx);
				153	gy2 = vmlaq_f32(gy2, gy, gy);
				154	gxgy = vmlaq_f32(gxgy, gx, gy);
				155	}
				156
				157	inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
				158	float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
				159	{
				160	// Normalize
				161	low_gx = vmulq_f32(low_gx, norm_factor);
				162	low_gy = vmulq_f32(low_gy, norm_factor);
				163	high_gx = vmulq_f32(high_gx, norm_factor);
				164	high_gy = vmulq_f32(high_gy, norm_factor);
				165
				166	// L3 values
				167	float32x4_t gx = low_gx;
				168	float32x4_t gy = low_gy;
				169
				170	// Accumulate
				171	gx2 = vmlaq_f32(gx2, gx, gx);
				172	gy2 = vmlaq_f32(gy2, gy, gy);
				173	gxgy = vmlaq_f32(gxgy, gx, gy);
				174
				175	// L2 values
				176	gx = vextq_f32(low_gx, high_gx, 1);
				177	gy = vextq_f32(low_gy, high_gy, 1);
				178
				179	// Accumulate
				180	gx2 = vmlaq_f32(gx2, gx, gx);
				181	gy2 = vmlaq_f32(gy2, gy, gy);
				182	gxgy = vmlaq_f32(gxgy, gx, gy);
				183
				184	// L1 values
				185	gx = vextq_f32(low_gx, high_gx, 2);
				186	gy = vextq_f32(low_gy, high_gy, 2);
				187
				188	// Accumulate
				189	gx2 = vmlaq_f32(gx2, gx, gx);
				190	gy2 = vmlaq_f32(gy2, gy, gy);
				191	gxgy = vmlaq_f32(gxgy, gx, gy);
				192
				193	// M values
				194	gx = vextq_f32(low_gx, high_gx, 3);
				195	gy = vextq_f32(low_gy, high_gy, 3);
				196
				197	// Accumulate
				198	gx2 = vmlaq_f32(gx2, gx, gx);
				199	gy2 = vmlaq_f32(gy2, gy, gy);
				200	gxgy = vmlaq_f32(gxgy, gx, gy);
				201
				202	// R1 values
				203	gx = high_gx;
				204	gy = high_gy;
				205
				206	// Accumulate
				207	gx2 = vmlaq_f32(gx2, gx, gx);
				208	gy2 = vmlaq_f32(gy2, gy, gy);
				209	gxgy = vmlaq_f32(gxgy, gx, gy);
				210
				211	// Change tmp_low and tmp_high for calculating R2 and R3 values
				212	low_gx = high_gx;
				213	low_gy = high_gy;
				214	high_gx = high_gx1;
				215	high_gy = high_gy1;
				216
				217	// Normalize
				218	high_gx = vmulq_f32(high_gx, norm_factor);
				219	high_gy = vmulq_f32(high_gy, norm_factor);
				220
				221	// R2 values
				222	gx = vextq_f32(low_gx, high_gx, 1);
				223	gy = vextq_f32(low_gy, high_gy, 1);
				224
				225	// Accumulate
				226	gx2 = vmlaq_f32(gx2, gx, gx);
				227	gy2 = vmlaq_f32(gy2, gy, gy);
				228	gxgy = vmlaq_f32(gxgy, gx, gy);
				229
				230	// R3 values
				231	gx = vextq_f32(low_gx, high_gx, 2);
				232	gy = vextq_f32(low_gy, high_gy, 2);
				233
				234	// Accumulate
				235	gx2 = vmlaq_f32(gx2, gx, gx);
				236	gy2 = vmlaq_f32(gy2, gy, gy);
				237	gxgy = vmlaq_f32(gxgy, gx, gy);
				238	}
				239
				240	inline void harris_score3x3_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				241	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				242
				243	{
				244	const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
				245	const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
				246	const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
				247	const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
				248	const auto output = static_cast<float *__restrict>(output_ptr);
				249
				250	// Gx^2, Gy^2 and Gx*Gy
				251	float32x4x2_t gx2 =
				252	{
				253	{
				254	vdupq_n_f32(0.0f),
				255	vdupq_n_f32(0.0f)
				256	}
				257	};
				258	float32x4x2_t gy2 =
				259	{
				260	{
				261	vdupq_n_f32(0.0f),
				262	vdupq_n_f32(0.0f)
				263	}
				264	};
				265	float32x4x2_t gxgy =
				266	{
				267	{
				268	vdupq_n_f32(0.0f),
				269	vdupq_n_f32(0.0f)
				270	}
				271	};
				272
				273	// Row0
				274	int16x8x2_t tmp_gx =
				275	{
				276	{
				277	vld1q_s16(gx_ptr_0 - input_stride),
				278	vld1q_s16(gx_ptr_1 - input_stride)
				279	}
				280	};
				281	int16x8x2_t tmp_gy =
				282	{
				283	{
				284	vld1q_s16(gy_ptr_0 - input_stride),
				285	vld1q_s16(gy_ptr_1 - input_stride)
				286	}
				287	};
				288	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				289	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				290	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				291
				292	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				293	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				294	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				295	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				296	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				297
				298	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				299	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				300	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				301	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				302	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				303
				304	// Row1
				305	tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
				306	tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
				307	tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
				308	tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
				309
				310	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				311	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				312	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				313	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				314	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				315
				316	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				317	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				318	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				319	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				320	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				321
				322	// Row2
				323	tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
				324	tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
				325	tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
				326	tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
				327
				328	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				329	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				330	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				331	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				332	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				333
				334	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				335	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				336	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				337	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				338	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				339
				340	// Calculate harris score
				341	const float32x4x2_t mc =
				342	{
				343	{
				344	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				345	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				346	}
				347	};
				348
				349	// Store score
				350	vst1q_f32(output + 0, mc.val[0]);
				351	vst1q_f32(output + 4, mc.val[1]);
				352	}
				353
				354	inline void harris_score3x3_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				355	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				356	{
				357	auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
				358	auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
				359	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				360	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				361	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				362	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				363	const auto output = static_cast<float *__restrict>(output_ptr);
				364	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				365	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				366	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				367
				368	// Gx^2, Gy^2 and Gx*Gy
				369	float32x4x2_t gx2 =
				370	{
				371	{
				372	vdupq_n_f32(0.0f),
				373	vdupq_n_f32(0.0f)
				374	}
				375	};
				376	float32x4x2_t gy2 =
				377	{
				378	{
				379	vdupq_n_f32(0.0f),
				380	vdupq_n_f32(0.0f)
				381	}
				382	};
				383	float32x4x2_t gxgy =
				384	{
				385	{
				386	vdupq_n_f32(0.0f),
				387	vdupq_n_f32(0.0f)
				388	}
				389	};
				390
				391	// Row0
				392	float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
				393	float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
				394	float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
				395	float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
				396	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				397
				398	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
				399	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
				400	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
				401	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
				402	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				403
				404	// Row1
				405	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				406	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				407	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				408	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				409	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				410
				411	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				412	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				413	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				414	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				415	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				416
				417	// Row2
				418	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
				419	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
				420	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
				421	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
				422	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				423
				424	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
				425	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
				426	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
				427	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
				428	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				429
				430	// Calculate harris score
				431	const float32x4x2_t mc =
				432	{
				433	{
				434	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				435	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				436	}
				437	};
				438
				439	// Store score
				440	vst1q_f32(output + 0, mc.val[0]);
				441	vst1q_f32(output + 4, mc.val[1]);
				442	}
				443
				444	inline void harris_score5x5_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				445	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				446	{
				447	auto gx_ptr_0 = static_cast<const int16_t __restrict>(input1_ptr) - 2 - 2 input_stride;
				448	auto gy_ptr_0 = static_cast<const int16_t __restrict>(input2_ptr) - 2 - 2 input_stride;
				449	const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
				450	const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
				451	const auto output = static_cast<float *__restrict>(output_ptr);
				452
				453	// Gx^2, Gy^2 and Gx*Gy
				454	float32x4x2_t gx2 =
				455	{
				456	{
				457	vdupq_n_f32(0.0f),
				458	vdupq_n_f32(0.0f)
				459	}
				460	};
				461	float32x4x2_t gy2 =
				462	{
				463	{
				464	vdupq_n_f32(0.0f),
				465	vdupq_n_f32(0.0f)
				466	}
				467	};
				468	float32x4x2_t gxgy =
				469	{
				470	{
				471	vdupq_n_f32(0.0f),
				472	vdupq_n_f32(0.0f)
				473	}
				474	};
				475	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				476	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				477	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				478
				479	for(int i = 0; i < 5; ++i)
				480	{
				481	const int16x8x2_t tmp_gx =
				482	{
				483	{
				484	vld1q_s16(gx_ptr_0),
				485	vld1q_s16(gx_ptr_1)
				486	}
				487	};
				488	const int16x8x2_t tmp_gy =
				489	{
				490	{
				491	vld1q_s16(gy_ptr_0),
				492	vld1q_s16(gy_ptr_1)
				493	}
				494	};
				495
				496	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				497	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				498	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				499	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				500	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				501
				502	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				503	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				504	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				505	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				506	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				507
				508	// Update gx and gy pointer
				509	gx_ptr_0 += input_stride;
				510	gy_ptr_0 += input_stride;
				511	gx_ptr_1 += input_stride;
				512	gy_ptr_1 += input_stride;
				513	}
				514
				515	// Calculate harris score
				516	const float32x4x2_t mc =
				517	{
				518	{
				519	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				520	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				521	}
				522	};
				523
				524	// Store score
				525	vst1q_f32(output + 0, mc.val[0]);
				526	vst1q_f32(output + 4, mc.val[1]);
				527	}
				528
				529	inline void harris_score5x5_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				530	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				531
				532	{
				533	auto gx_ptr_0 = static_cast<const int32_t __restrict>(input1_ptr) - 2 - 2 input_stride;
				534	auto gy_ptr_0 = static_cast<const int32_t __restrict>(input2_ptr) - 2 - 2 input_stride;
				535	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				536	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				537	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				538	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				539	const auto output = static_cast<float *__restrict>(output_ptr);
				540
				541	// Gx^2, Gy^2 and Gx*Gy
				542	float32x4x2_t gx2 =
				543	{
				544	{
				545	vdupq_n_f32(0.0f),
				546	vdupq_n_f32(0.0f)
				547	}
				548	};
				549	float32x4x2_t gy2 =
				550	{
				551	{
				552	vdupq_n_f32(0.0f),
				553	vdupq_n_f32(0.0f)
				554	}
				555	};
				556	float32x4x2_t gxgy =
				557	{
				558	{
				559	vdupq_n_f32(0.0f),
				560	vdupq_n_f32(0.0f)
				561	}
				562	};
				563	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				564	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				565	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				566
				567	for(int i = 0; i < 5; ++i)
				568	{
				569	const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				570	const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				571	const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				572	const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				573	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				574
				575	const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				576	const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				577	const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				578	const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				579	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				580
				581	// Update gx and gy pointer
				582	gx_ptr_0 += input_stride;
				583	gy_ptr_0 += input_stride;
				584	gx_ptr_1 += input_stride;
				585	gy_ptr_1 += input_stride;
				586	gx_ptr_2 += input_stride;
				587	gy_ptr_2 += input_stride;
				588	}
				589
				590	// Calculate harris score
				591	const float32x4x2_t mc =
				592	{
				593	{
				594	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				595	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				596	}
				597	};
				598
				599	// Store score
				600	vst1q_f32(output + 0, mc.val[0]);
				601	vst1q_f32(output + 4, mc.val[1]);
				602	}
				603
				604	inline void harris_score7x7_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				605	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				606	{
				607	auto gx_ptr_0 = static_cast<const int16_t __restrict>(input1_ptr) - 3 - 3 input_stride;
				608	auto gy_ptr_0 = static_cast<const int16_t __restrict>(input2_ptr) - 3 - 3 input_stride;
				609	const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
				610	const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
				611	const auto output = static_cast<float *__restrict>(output_ptr);
				612
				613	// Gx^2, Gy^2 and Gx*Gy
				614	float32x4_t gx2 = vdupq_n_f32(0.0f);
				615	float32x4_t gy2 = vdupq_n_f32(0.0f);
				616	float32x4_t gxgy = vdupq_n_f32(0.0f);
				617	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				618	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				619	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				620
				621	for(int i = 0; i < 7; ++i)
				622	{
				623	const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
				624	const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
				625	const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
				626	const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
				627
				628	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
				629	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
				630	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
				631	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
				632	float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
				633	float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
				634	harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
				635
				636	// Update gx and gy pointer
				637	gx_ptr_0 += input_stride;
				638	gy_ptr_0 += input_stride;
				639	gx_ptr_1 += input_stride;
				640	gy_ptr_1 += input_stride;
				641	}
				642
				643	// Calculate harris score
				644	const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				645
				646	// Store score
				647	vst1q_f32(output, mc);
				648	}
				649
				650	inline void harris_score7x7_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				651	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				652	{
				653	auto gx_ptr_0 = static_cast<const int32_t __restrict>(input1_ptr) - 3 - 3 input_stride;
				654	auto gy_ptr_0 = static_cast<const int32_t __restrict>(input2_ptr) - 3 - 3 input_stride;
				655	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				656	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				657	const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
				658	const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
				659	const auto output = static_cast<float *__restrict>(output_ptr);
				660
				661	// Gx^2, Gy^2 and Gx*Gy
				662	float32x4_t gx2 = vdupq_n_f32(0.0f);
				663	float32x4_t gy2 = vdupq_n_f32(0.0f);
				664	float32x4_t gxgy = vdupq_n_f32(0.0f);
				665	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				666	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				667	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				668
				669	for(int i = 0; i < 7; ++i)
				670	{
				671	const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				672	const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				673	const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				674	const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				675	const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				676	const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				677	harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
				678
				679	// Update gx and gy pointer
				680	gx_ptr_0 += input_stride;
				681	gy_ptr_0 += input_stride;
				682	gx_ptr_1 += input_stride;
				683	gy_ptr_1 += input_stride;
				684	gx_ptr_2 += input_stride;
				685	gy_ptr_2 += input_stride;
				686	}
				687
				688	// Calculate harris score
				689	const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				690
				691	// Store score
				692	vst1q_f32(output, mc);
				693	}
				694
				695	} // namespace
				696
				697	INEHarrisScoreKernel::INEHarrisScoreKernel()
				698	: _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
				699	{
				700	}
				701
				702	template <int32_t block_size>
				703	NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
				704	: INEHarrisScoreKernel(), _func(nullptr)
				705	{
				706	}
				707
				708	template <int32_t block_size>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	709	void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	710	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	711	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	712	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				713	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				714	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				715
				716	Iterator input1(_input1, window);
				717	Iterator input2(_input2, window);
				718	Iterator output(_output, window);
				719
				720	const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
				721
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	722	execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	723	{
				724	(*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
				725	},
				726	input1, input2, output);
				727	}
				728
				729	template <int32_t block_size>
				730	BorderSize NEHarrisScoreKernel<block_size>::border_size() const
				731	{
				732	return _border_size;
				733	}
				734
				735	template <int32_t block_size>
				736	void NEHarrisScoreKernel<block_size>::configure(const IImage input1, const IImage input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
				737	bool border_undefined)
				738	{
				739	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
				740	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
				741	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
				742	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
				743	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
				744	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
				745	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
				746	ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
				747
				748	_input1 = input1;
				749	_input2 = input2;
				750	_output = output;
				751	_sensitivity = sensitivity;
				752	_strength_thresh = strength_thresh;
				753	_norm_factor = norm_factor;
				754	_border_size = BorderSize(block_size / 2);
				755
				756	if(input1->info()->data_type() == DataType::S16)
				757	{
				758	switch(block_size)
				759	{
				760	case 3:
				761	_func = &harris_score3x3_S16_S16_FLOAT;
				762	break;
				763	case 5:
				764	_func = &harris_score5x5_S16_S16_FLOAT;
				765	break;
				766	case 7:
				767	_func = &harris_score7x7_S16_S16_FLOAT;
				768	break;
				769	default:
				770	ARM_COMPUTE_ERROR("Invalid block size");
				771	break;
				772	}
				773	}
				774	else
				775	{
				776	switch(block_size)
				777	{
				778	case 3:
				779	_func = &harris_score3x3_S32_S32_FLOAT;
				780	break;
				781	case 5:
				782	_func = &harris_score5x5_S32_S32_FLOAT;
				783	break;
				784	case 7:
				785	_func = &harris_score7x7_S32_S32_FLOAT;
				786	break;
				787	default:
				788	ARM_COMPUTE_ERROR("Invalid block size");
				789	break;
				790	}
				791	}
				792
				793	ARM_COMPUTE_ERROR_ON(nullptr == _func);
				794
				795	constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
				796	constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
				797	constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
				798	constexpr unsigned int num_rows_read_per_iteration = block_size;
				799
				800	// Configure kernel window
				801	Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				802	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				803
				804	update_window_and_padding(win,
				805	AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				806	AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				807	output_access);
				808
				809	ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
				810	input2->info()->valid_region());
				811
				812	output_access.set_valid_region(win, valid_region, border_undefined, border_size());
				813
				814	INEKernel::configure(win);
				815	}