Blame - src/core/NEON/kernels/NEHarrisCornersKernel.cpp - ml/ComputeLibrary

blob: 4159e434b224dc3acb0c20487712825ac4d39126 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	2	* Copyright (c) 2016-2020 Arm Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Michalis Spyrou	ebcebf1	2020-10-21 00:04:14 +0100	[diff] [blame]	24	#include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	25
				26	#include "arm_compute/core/Coordinates.h"
				27	#include "arm_compute/core/Error.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/TensorInfo.h"
				30	#include "arm_compute/core/Types.h"
				31	#include "arm_compute/core/Utils.h"
				32	#include "arm_compute/core/Validate.h"
				33	#include "arm_compute/core/Window.h"
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	34	#include "src/core/helpers/AutoConfiguration.h"
				35	#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	36
				37	#include <algorithm>
				38	#include <arm_neon.h>
				39	#include <cmath>
				40	#include <cstddef>
				41
				42	using namespace arm_compute;
				43
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	44	template class arm_compute::NEHarrisScoreKernel<3>;
				45	template class arm_compute::NEHarrisScoreKernel<5>;
				46	template class arm_compute::NEHarrisScoreKernel<7>;
				47	template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
				48	template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
				49	template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
				50
				51	namespace
				52	{
				53	inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
				54	{
				55	// Trace^2
				56	float32x4_t trace2 = vaddq_f32(gx2, gy2);
				57	trace2 = vmulq_f32(trace2, trace2);
				58
				59	// Det(A)
				60	float32x4_t det = vmulq_f32(gx2, gy2);
				61	det = vmlsq_f32(det, gxgy, gxgy);
				62
				63	// Det(A) - sensitivity * trace^2
				64	const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
				65
				66	// mc > strength_thresh
				67	const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
				68
				69	return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
				70	}
				71
				72	inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
				73	float32x4_t norm_factor)
				74	{
				75	// Normalize
				76	low_gx = vmulq_f32(low_gx, norm_factor);
				77	low_gy = vmulq_f32(low_gy, norm_factor);
				78	high_gx = vmulq_f32(high_gx, norm_factor);
				79	high_gy = vmulq_f32(high_gy, norm_factor);
				80
				81	const float32x4_t l_gx = low_gx;
				82	const float32x4_t l_gy = low_gy;
				83	const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
				84	const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
				85	const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
				86	const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
				87
				88	// Gx*Gx
				89	gx2 = vmlaq_f32(gx2, l_gx, l_gx);
				90	gx2 = vmlaq_f32(gx2, m_gx, m_gx);
				91	gx2 = vmlaq_f32(gx2, r_gx, r_gx);
				92
				93	// Gy*Gy
				94	gy2 = vmlaq_f32(gy2, l_gy, l_gy);
				95	gy2 = vmlaq_f32(gy2, m_gy, m_gy);
				96	gy2 = vmlaq_f32(gy2, r_gy, r_gy);
				97
				98	// Gx*Gy
				99	gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
				100	gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
				101	gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
				102	}
				103
				104	inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
				105	float32x4_t norm_factor)
				106	{
				107	// Normalize
				108	low_gx = vmulq_f32(low_gx, norm_factor);
				109	low_gy = vmulq_f32(low_gy, norm_factor);
				110	high_gx = vmulq_f32(high_gx, norm_factor);
				111	high_gy = vmulq_f32(high_gy, norm_factor);
				112
				113	// L2 values
				114	float32x4_t gx = low_gx;
				115	float32x4_t gy = low_gy;
				116
				117	// Accumulate
				118	gx2 = vmlaq_f32(gx2, gx, gx);
				119	gy2 = vmlaq_f32(gy2, gy, gy);
				120	gxgy = vmlaq_f32(gxgy, gx, gy);
				121
				122	// L1 values
				123	gx = vextq_f32(low_gx, high_gx, 1);
				124	gy = vextq_f32(low_gy, high_gy, 1);
				125
				126	// Accumulate
				127	gx2 = vmlaq_f32(gx2, gx, gx);
				128	gy2 = vmlaq_f32(gy2, gy, gy);
				129	gxgy = vmlaq_f32(gxgy, gx, gy);
				130
				131	// M values
				132	gx = vextq_f32(low_gx, high_gx, 2);
				133	gy = vextq_f32(low_gy, high_gy, 2);
				134
				135	// Accumulate
				136	gx2 = vmlaq_f32(gx2, gx, gx);
				137	gy2 = vmlaq_f32(gy2, gy, gy);
				138	gxgy = vmlaq_f32(gxgy, gx, gy);
				139
				140	// R1 values
				141	gx = vextq_f32(low_gx, high_gx, 3);
				142	gy = vextq_f32(low_gy, high_gy, 3);
				143
				144	// Accumulate
				145	gx2 = vmlaq_f32(gx2, gx, gx);
				146	gy2 = vmlaq_f32(gy2, gy, gy);
				147	gxgy = vmlaq_f32(gxgy, gx, gy);
				148
				149	// R2 values
				150	gx = high_gx;
				151	gy = high_gy;
				152
				153	// Accumulate
				154	gx2 = vmlaq_f32(gx2, gx, gx);
				155	gy2 = vmlaq_f32(gy2, gy, gy);
				156	gxgy = vmlaq_f32(gxgy, gx, gy);
				157	}
				158
				159	inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
				160	float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
				161	{
				162	// Normalize
				163	low_gx = vmulq_f32(low_gx, norm_factor);
				164	low_gy = vmulq_f32(low_gy, norm_factor);
				165	high_gx = vmulq_f32(high_gx, norm_factor);
				166	high_gy = vmulq_f32(high_gy, norm_factor);
				167
				168	// L3 values
				169	float32x4_t gx = low_gx;
				170	float32x4_t gy = low_gy;
				171
				172	// Accumulate
				173	gx2 = vmlaq_f32(gx2, gx, gx);
				174	gy2 = vmlaq_f32(gy2, gy, gy);
				175	gxgy = vmlaq_f32(gxgy, gx, gy);
				176
				177	// L2 values
				178	gx = vextq_f32(low_gx, high_gx, 1);
				179	gy = vextq_f32(low_gy, high_gy, 1);
				180
				181	// Accumulate
				182	gx2 = vmlaq_f32(gx2, gx, gx);
				183	gy2 = vmlaq_f32(gy2, gy, gy);
				184	gxgy = vmlaq_f32(gxgy, gx, gy);
				185
				186	// L1 values
				187	gx = vextq_f32(low_gx, high_gx, 2);
				188	gy = vextq_f32(low_gy, high_gy, 2);
				189
				190	// Accumulate
				191	gx2 = vmlaq_f32(gx2, gx, gx);
				192	gy2 = vmlaq_f32(gy2, gy, gy);
				193	gxgy = vmlaq_f32(gxgy, gx, gy);
				194
				195	// M values
				196	gx = vextq_f32(low_gx, high_gx, 3);
				197	gy = vextq_f32(low_gy, high_gy, 3);
				198
				199	// Accumulate
				200	gx2 = vmlaq_f32(gx2, gx, gx);
				201	gy2 = vmlaq_f32(gy2, gy, gy);
				202	gxgy = vmlaq_f32(gxgy, gx, gy);
				203
				204	// R1 values
				205	gx = high_gx;
				206	gy = high_gy;
				207
				208	// Accumulate
				209	gx2 = vmlaq_f32(gx2, gx, gx);
				210	gy2 = vmlaq_f32(gy2, gy, gy);
				211	gxgy = vmlaq_f32(gxgy, gx, gy);
				212
				213	// Change tmp_low and tmp_high for calculating R2 and R3 values
				214	low_gx = high_gx;
				215	low_gy = high_gy;
				216	high_gx = high_gx1;
				217	high_gy = high_gy1;
				218
				219	// Normalize
				220	high_gx = vmulq_f32(high_gx, norm_factor);
				221	high_gy = vmulq_f32(high_gy, norm_factor);
				222
				223	// R2 values
				224	gx = vextq_f32(low_gx, high_gx, 1);
				225	gy = vextq_f32(low_gy, high_gy, 1);
				226
				227	// Accumulate
				228	gx2 = vmlaq_f32(gx2, gx, gx);
				229	gy2 = vmlaq_f32(gy2, gy, gy);
				230	gxgy = vmlaq_f32(gxgy, gx, gy);
				231
				232	// R3 values
				233	gx = vextq_f32(low_gx, high_gx, 2);
				234	gy = vextq_f32(low_gy, high_gy, 2);
				235
				236	// Accumulate
				237	gx2 = vmlaq_f32(gx2, gx, gx);
				238	gy2 = vmlaq_f32(gy2, gy, gy);
				239	gxgy = vmlaq_f32(gxgy, gx, gy);
				240	}
				241
				242	inline void harris_score3x3_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				243	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				244
				245	{
				246	const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
				247	const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
				248	const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
				249	const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
				250	const auto output = static_cast<float *__restrict>(output_ptr);
				251
				252	// Gx^2, Gy^2 and Gx*Gy
				253	float32x4x2_t gx2 =
				254	{
				255	{
				256	vdupq_n_f32(0.0f),
				257	vdupq_n_f32(0.0f)
				258	}
				259	};
				260	float32x4x2_t gy2 =
				261	{
				262	{
				263	vdupq_n_f32(0.0f),
				264	vdupq_n_f32(0.0f)
				265	}
				266	};
				267	float32x4x2_t gxgy =
				268	{
				269	{
				270	vdupq_n_f32(0.0f),
				271	vdupq_n_f32(0.0f)
				272	}
				273	};
				274
				275	// Row0
				276	int16x8x2_t tmp_gx =
				277	{
				278	{
				279	vld1q_s16(gx_ptr_0 - input_stride),
				280	vld1q_s16(gx_ptr_1 - input_stride)
				281	}
				282	};
				283	int16x8x2_t tmp_gy =
				284	{
				285	{
				286	vld1q_s16(gy_ptr_0 - input_stride),
				287	vld1q_s16(gy_ptr_1 - input_stride)
				288	}
				289	};
				290	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				291	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				292	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				293
				294	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				295	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				296	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				297	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				298	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				299
				300	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				301	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				302	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				303	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				304	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				305
				306	// Row1
				307	tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
				308	tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
				309	tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
				310	tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
				311
				312	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				313	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				314	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				315	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				316	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				317
				318	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				319	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				320	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				321	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				322	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				323
				324	// Row2
				325	tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
				326	tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
				327	tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
				328	tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
				329
				330	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				331	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				332	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				333	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				334	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				335
				336	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				337	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				338	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				339	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				340	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				341
				342	// Calculate harris score
				343	const float32x4x2_t mc =
				344	{
				345	{
				346	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				347	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				348	}
				349	};
				350
				351	// Store score
				352	vst1q_f32(output + 0, mc.val[0]);
				353	vst1q_f32(output + 4, mc.val[1]);
				354	}
				355
				356	inline void harris_score3x3_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				357	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				358	{
				359	auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
				360	auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
				361	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				362	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				363	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				364	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				365	const auto output = static_cast<float *__restrict>(output_ptr);
				366	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				367	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				368	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				369
				370	// Gx^2, Gy^2 and Gx*Gy
				371	float32x4x2_t gx2 =
				372	{
				373	{
				374	vdupq_n_f32(0.0f),
				375	vdupq_n_f32(0.0f)
				376	}
				377	};
				378	float32x4x2_t gy2 =
				379	{
				380	{
				381	vdupq_n_f32(0.0f),
				382	vdupq_n_f32(0.0f)
				383	}
				384	};
				385	float32x4x2_t gxgy =
				386	{
				387	{
				388	vdupq_n_f32(0.0f),
				389	vdupq_n_f32(0.0f)
				390	}
				391	};
				392
				393	// Row0
				394	float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
				395	float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
				396	float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
				397	float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
				398	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				399
				400	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
				401	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
				402	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
				403	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
				404	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				405
				406	// Row1
				407	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				408	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				409	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				410	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				411	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				412
				413	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				414	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				415	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				416	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				417	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				418
				419	// Row2
				420	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
				421	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
				422	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
				423	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
				424	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				425
				426	low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
				427	low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
				428	high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
				429	high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
				430	harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				431
				432	// Calculate harris score
				433	const float32x4x2_t mc =
				434	{
				435	{
				436	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				437	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				438	}
				439	};
				440
				441	// Store score
				442	vst1q_f32(output + 0, mc.val[0]);
				443	vst1q_f32(output + 4, mc.val[1]);
				444	}
				445
				446	inline void harris_score5x5_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				447	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				448	{
				449	auto gx_ptr_0 = static_cast<const int16_t __restrict>(input1_ptr) - 2 - 2 input_stride;
				450	auto gy_ptr_0 = static_cast<const int16_t __restrict>(input2_ptr) - 2 - 2 input_stride;
				451	const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
				452	const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
				453	const auto output = static_cast<float *__restrict>(output_ptr);
				454
				455	// Gx^2, Gy^2 and Gx*Gy
				456	float32x4x2_t gx2 =
				457	{
				458	{
				459	vdupq_n_f32(0.0f),
				460	vdupq_n_f32(0.0f)
				461	}
				462	};
				463	float32x4x2_t gy2 =
				464	{
				465	{
				466	vdupq_n_f32(0.0f),
				467	vdupq_n_f32(0.0f)
				468	}
				469	};
				470	float32x4x2_t gxgy =
				471	{
				472	{
				473	vdupq_n_f32(0.0f),
				474	vdupq_n_f32(0.0f)
				475	}
				476	};
				477	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				478	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				479	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				480
				481	for(int i = 0; i < 5; ++i)
				482	{
				483	const int16x8x2_t tmp_gx =
				484	{
				485	{
				486	vld1q_s16(gx_ptr_0),
				487	vld1q_s16(gx_ptr_1)
				488	}
				489	};
				490	const int16x8x2_t tmp_gy =
				491	{
				492	{
				493	vld1q_s16(gy_ptr_0),
				494	vld1q_s16(gy_ptr_1)
				495	}
				496	};
				497
				498	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
				499	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
				500	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
				501	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
				502	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				503
				504	low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
				505	low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
				506	high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
				507	high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
				508	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				509
				510	// Update gx and gy pointer
				511	gx_ptr_0 += input_stride;
				512	gy_ptr_0 += input_stride;
				513	gx_ptr_1 += input_stride;
				514	gy_ptr_1 += input_stride;
				515	}
				516
				517	// Calculate harris score
				518	const float32x4x2_t mc =
				519	{
				520	{
				521	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				522	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				523	}
				524	};
				525
				526	// Store score
				527	vst1q_f32(output + 0, mc.val[0]);
				528	vst1q_f32(output + 4, mc.val[1]);
				529	}
				530
				531	inline void harris_score5x5_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				532	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				533
				534	{
				535	auto gx_ptr_0 = static_cast<const int32_t __restrict>(input1_ptr) - 2 - 2 input_stride;
				536	auto gy_ptr_0 = static_cast<const int32_t __restrict>(input2_ptr) - 2 - 2 input_stride;
				537	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				538	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				539	const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
				540	const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
				541	const auto output = static_cast<float *__restrict>(output_ptr);
				542
				543	// Gx^2, Gy^2 and Gx*Gy
				544	float32x4x2_t gx2 =
				545	{
				546	{
				547	vdupq_n_f32(0.0f),
				548	vdupq_n_f32(0.0f)
				549	}
				550	};
				551	float32x4x2_t gy2 =
				552	{
				553	{
				554	vdupq_n_f32(0.0f),
				555	vdupq_n_f32(0.0f)
				556	}
				557	};
				558	float32x4x2_t gxgy =
				559	{
				560	{
				561	vdupq_n_f32(0.0f),
				562	vdupq_n_f32(0.0f)
				563	}
				564	};
				565	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				566	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				567	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				568
				569	for(int i = 0; i < 5; ++i)
				570	{
				571	const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				572	const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				573	const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				574	const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				575	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
				576
				577	const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				578	const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				579	const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				580	const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				581	harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
				582
				583	// Update gx and gy pointer
				584	gx_ptr_0 += input_stride;
				585	gy_ptr_0 += input_stride;
				586	gx_ptr_1 += input_stride;
				587	gy_ptr_1 += input_stride;
				588	gx_ptr_2 += input_stride;
				589	gy_ptr_2 += input_stride;
				590	}
				591
				592	// Calculate harris score
				593	const float32x4x2_t mc =
				594	{
				595	{
				596	harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
				597	harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
				598	}
				599	};
				600
				601	// Store score
				602	vst1q_f32(output + 0, mc.val[0]);
				603	vst1q_f32(output + 4, mc.val[1]);
				604	}
				605
				606	inline void harris_score7x7_S16_S16_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				607	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				608	{
				609	auto gx_ptr_0 = static_cast<const int16_t __restrict>(input1_ptr) - 3 - 3 input_stride;
				610	auto gy_ptr_0 = static_cast<const int16_t __restrict>(input2_ptr) - 3 - 3 input_stride;
				611	const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
				612	const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
				613	const auto output = static_cast<float *__restrict>(output_ptr);
				614
				615	// Gx^2, Gy^2 and Gx*Gy
				616	float32x4_t gx2 = vdupq_n_f32(0.0f);
				617	float32x4_t gy2 = vdupq_n_f32(0.0f);
				618	float32x4_t gxgy = vdupq_n_f32(0.0f);
				619	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				620	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				621	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				622
				623	for(int i = 0; i < 7; ++i)
				624	{
				625	const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
				626	const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
				627	const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
				628	const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
				629
				630	float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
				631	float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
				632	float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
				633	float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
				634	float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
				635	float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
				636	harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
				637
				638	// Update gx and gy pointer
				639	gx_ptr_0 += input_stride;
				640	gy_ptr_0 += input_stride;
				641	gx_ptr_1 += input_stride;
				642	gy_ptr_1 += input_stride;
				643	}
				644
				645	// Calculate harris score
				646	const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				647
				648	// Store score
				649	vst1q_f32(output, mc);
				650	}
				651
				652	inline void harris_score7x7_S32_S32_FLOAT(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
				653	float in_norm_factor, float in_sensitivity, float in_strength_thresh)
				654	{
				655	auto gx_ptr_0 = static_cast<const int32_t __restrict>(input1_ptr) - 3 - 3 input_stride;
				656	auto gy_ptr_0 = static_cast<const int32_t __restrict>(input2_ptr) - 3 - 3 input_stride;
				657	const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
				658	const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
				659	const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
				660	const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
				661	const auto output = static_cast<float *__restrict>(output_ptr);
				662
				663	// Gx^2, Gy^2 and Gx*Gy
				664	float32x4_t gx2 = vdupq_n_f32(0.0f);
				665	float32x4_t gy2 = vdupq_n_f32(0.0f);
				666	float32x4_t gxgy = vdupq_n_f32(0.0f);
				667	float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
				668	float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
				669	float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
				670
				671	for(int i = 0; i < 7; ++i)
				672	{
				673	const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
				674	const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
				675	const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
				676	const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
				677	const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
				678	const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
				679	harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
				680
				681	// Update gx and gy pointer
				682	gx_ptr_0 += input_stride;
				683	gy_ptr_0 += input_stride;
				684	gx_ptr_1 += input_stride;
				685	gy_ptr_1 += input_stride;
				686	gx_ptr_2 += input_stride;
				687	gy_ptr_2 += input_stride;
				688	}
				689
				690	// Calculate harris score
				691	const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
				692
				693	// Store score
				694	vst1q_f32(output, mc);
				695	}
				696
				697	} // namespace
				698
				699	INEHarrisScoreKernel::INEHarrisScoreKernel()
				700	: _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
				701	{
				702	}
				703
				704	template <int32_t block_size>
				705	NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
				706	: INEHarrisScoreKernel(), _func(nullptr)
				707	{
				708	}
				709
				710	template <int32_t block_size>
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	711	void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	712	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	713	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	714	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				715	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				716	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				717
				718	Iterator input1(_input1, window);
				719	Iterator input2(_input2, window);
				720	Iterator output(_output, window);
				721
				722	const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
				723
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	724	execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	725	{
				726	(*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
				727	},
				728	input1, input2, output);
				729	}
				730
				731	template <int32_t block_size>
				732	BorderSize NEHarrisScoreKernel<block_size>::border_size() const
				733	{
				734	return _border_size;
				735	}
				736
				737	template <int32_t block_size>
				738	void NEHarrisScoreKernel<block_size>::configure(const IImage input1, const IImage input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
				739	bool border_undefined)
				740	{
				741	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
				742	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
				743	ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
				744	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
				745	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
				746	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
				747	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
				748	ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
				749
				750	_input1 = input1;
				751	_input2 = input2;
				752	_output = output;
				753	_sensitivity = sensitivity;
				754	_strength_thresh = strength_thresh;
				755	_norm_factor = norm_factor;
				756	_border_size = BorderSize(block_size / 2);
				757
				758	if(input1->info()->data_type() == DataType::S16)
				759	{
				760	switch(block_size)
				761	{
				762	case 3:
				763	_func = &harris_score3x3_S16_S16_FLOAT;
				764	break;
				765	case 5:
				766	_func = &harris_score5x5_S16_S16_FLOAT;
				767	break;
				768	case 7:
				769	_func = &harris_score7x7_S16_S16_FLOAT;
				770	break;
				771	default:
				772	ARM_COMPUTE_ERROR("Invalid block size");
				773	break;
				774	}
				775	}
				776	else
				777	{
				778	switch(block_size)
				779	{
				780	case 3:
				781	_func = &harris_score3x3_S32_S32_FLOAT;
				782	break;
				783	case 5:
				784	_func = &harris_score5x5_S32_S32_FLOAT;
				785	break;
				786	case 7:
				787	_func = &harris_score7x7_S32_S32_FLOAT;
				788	break;
				789	default:
				790	ARM_COMPUTE_ERROR("Invalid block size");
				791	break;
				792	}
				793	}
				794
				795	ARM_COMPUTE_ERROR_ON(nullptr == _func);
				796
				797	constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
				798	constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
				799	constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
				800	constexpr unsigned int num_rows_read_per_iteration = block_size;
				801
				802	// Configure kernel window
				803	Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
				804	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				805
				806	update_window_and_padding(win,
				807	AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				808	AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
				809	output_access);
				810
				811	ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
				812	input2->info()->valid_region());
				813
				814	output_access.set_valid_region(win, valid_region, border_undefined, border_size());
				815
				816	INEKernel::configure(win);
				817	}