blob: 4159e434b224dc3acb0c20487712825ac4d39126 [file] [log] [blame]
/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Types.h"
31#include "arm_compute/core/Utils.h"
32#include "arm_compute/core/Validate.h"
33#include "arm_compute/core/Window.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010034#include "src/core/helpers/AutoConfiguration.h"
35#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036
37#include <algorithm>
38#include <arm_neon.h>
39#include <cmath>
40#include <cstddef>
41
42using namespace arm_compute;
43
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044template class arm_compute::NEHarrisScoreKernel<3>;
45template class arm_compute::NEHarrisScoreKernel<5>;
46template class arm_compute::NEHarrisScoreKernel<7>;
47template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
48template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
49template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
50
51namespace
52{
53inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
54{
55 // Trace^2
56 float32x4_t trace2 = vaddq_f32(gx2, gy2);
57 trace2 = vmulq_f32(trace2, trace2);
58
59 // Det(A)
60 float32x4_t det = vmulq_f32(gx2, gy2);
61 det = vmlsq_f32(det, gxgy, gxgy);
62
63 // Det(A) - sensitivity * trace^2
64 const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
65
66 // mc > strength_thresh
67 const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
68
69 return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
70}
71
72inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
73 float32x4_t norm_factor)
74{
75 // Normalize
76 low_gx = vmulq_f32(low_gx, norm_factor);
77 low_gy = vmulq_f32(low_gy, norm_factor);
78 high_gx = vmulq_f32(high_gx, norm_factor);
79 high_gy = vmulq_f32(high_gy, norm_factor);
80
81 const float32x4_t l_gx = low_gx;
82 const float32x4_t l_gy = low_gy;
83 const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
84 const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
85 const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
86 const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
87
88 // Gx*Gx
89 gx2 = vmlaq_f32(gx2, l_gx, l_gx);
90 gx2 = vmlaq_f32(gx2, m_gx, m_gx);
91 gx2 = vmlaq_f32(gx2, r_gx, r_gx);
92
93 // Gy*Gy
94 gy2 = vmlaq_f32(gy2, l_gy, l_gy);
95 gy2 = vmlaq_f32(gy2, m_gy, m_gy);
96 gy2 = vmlaq_f32(gy2, r_gy, r_gy);
97
98 // Gx*Gy
99 gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
100 gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
101 gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
102}
103
104inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
105 float32x4_t norm_factor)
106{
107 // Normalize
108 low_gx = vmulq_f32(low_gx, norm_factor);
109 low_gy = vmulq_f32(low_gy, norm_factor);
110 high_gx = vmulq_f32(high_gx, norm_factor);
111 high_gy = vmulq_f32(high_gy, norm_factor);
112
113 // L2 values
114 float32x4_t gx = low_gx;
115 float32x4_t gy = low_gy;
116
117 // Accumulate
118 gx2 = vmlaq_f32(gx2, gx, gx);
119 gy2 = vmlaq_f32(gy2, gy, gy);
120 gxgy = vmlaq_f32(gxgy, gx, gy);
121
122 // L1 values
123 gx = vextq_f32(low_gx, high_gx, 1);
124 gy = vextq_f32(low_gy, high_gy, 1);
125
126 // Accumulate
127 gx2 = vmlaq_f32(gx2, gx, gx);
128 gy2 = vmlaq_f32(gy2, gy, gy);
129 gxgy = vmlaq_f32(gxgy, gx, gy);
130
131 // M values
132 gx = vextq_f32(low_gx, high_gx, 2);
133 gy = vextq_f32(low_gy, high_gy, 2);
134
135 // Accumulate
136 gx2 = vmlaq_f32(gx2, gx, gx);
137 gy2 = vmlaq_f32(gy2, gy, gy);
138 gxgy = vmlaq_f32(gxgy, gx, gy);
139
140 // R1 values
141 gx = vextq_f32(low_gx, high_gx, 3);
142 gy = vextq_f32(low_gy, high_gy, 3);
143
144 // Accumulate
145 gx2 = vmlaq_f32(gx2, gx, gx);
146 gy2 = vmlaq_f32(gy2, gy, gy);
147 gxgy = vmlaq_f32(gxgy, gx, gy);
148
149 // R2 values
150 gx = high_gx;
151 gy = high_gy;
152
153 // Accumulate
154 gx2 = vmlaq_f32(gx2, gx, gx);
155 gy2 = vmlaq_f32(gy2, gy, gy);
156 gxgy = vmlaq_f32(gxgy, gx, gy);
157}
158
159inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
160 float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
161{
162 // Normalize
163 low_gx = vmulq_f32(low_gx, norm_factor);
164 low_gy = vmulq_f32(low_gy, norm_factor);
165 high_gx = vmulq_f32(high_gx, norm_factor);
166 high_gy = vmulq_f32(high_gy, norm_factor);
167
168 // L3 values
169 float32x4_t gx = low_gx;
170 float32x4_t gy = low_gy;
171
172 // Accumulate
173 gx2 = vmlaq_f32(gx2, gx, gx);
174 gy2 = vmlaq_f32(gy2, gy, gy);
175 gxgy = vmlaq_f32(gxgy, gx, gy);
176
177 // L2 values
178 gx = vextq_f32(low_gx, high_gx, 1);
179 gy = vextq_f32(low_gy, high_gy, 1);
180
181 // Accumulate
182 gx2 = vmlaq_f32(gx2, gx, gx);
183 gy2 = vmlaq_f32(gy2, gy, gy);
184 gxgy = vmlaq_f32(gxgy, gx, gy);
185
186 // L1 values
187 gx = vextq_f32(low_gx, high_gx, 2);
188 gy = vextq_f32(low_gy, high_gy, 2);
189
190 // Accumulate
191 gx2 = vmlaq_f32(gx2, gx, gx);
192 gy2 = vmlaq_f32(gy2, gy, gy);
193 gxgy = vmlaq_f32(gxgy, gx, gy);
194
195 // M values
196 gx = vextq_f32(low_gx, high_gx, 3);
197 gy = vextq_f32(low_gy, high_gy, 3);
198
199 // Accumulate
200 gx2 = vmlaq_f32(gx2, gx, gx);
201 gy2 = vmlaq_f32(gy2, gy, gy);
202 gxgy = vmlaq_f32(gxgy, gx, gy);
203
204 // R1 values
205 gx = high_gx;
206 gy = high_gy;
207
208 // Accumulate
209 gx2 = vmlaq_f32(gx2, gx, gx);
210 gy2 = vmlaq_f32(gy2, gy, gy);
211 gxgy = vmlaq_f32(gxgy, gx, gy);
212
213 // Change tmp_low and tmp_high for calculating R2 and R3 values
214 low_gx = high_gx;
215 low_gy = high_gy;
216 high_gx = high_gx1;
217 high_gy = high_gy1;
218
219 // Normalize
220 high_gx = vmulq_f32(high_gx, norm_factor);
221 high_gy = vmulq_f32(high_gy, norm_factor);
222
223 // R2 values
224 gx = vextq_f32(low_gx, high_gx, 1);
225 gy = vextq_f32(low_gy, high_gy, 1);
226
227 // Accumulate
228 gx2 = vmlaq_f32(gx2, gx, gx);
229 gy2 = vmlaq_f32(gy2, gy, gy);
230 gxgy = vmlaq_f32(gxgy, gx, gy);
231
232 // R3 values
233 gx = vextq_f32(low_gx, high_gx, 2);
234 gy = vextq_f32(low_gy, high_gy, 2);
235
236 // Accumulate
237 gx2 = vmlaq_f32(gx2, gx, gx);
238 gy2 = vmlaq_f32(gy2, gy, gy);
239 gxgy = vmlaq_f32(gxgy, gx, gy);
240}
241
242inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
243 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
244
245{
246 const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
247 const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
248 const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
249 const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
250 const auto output = static_cast<float *__restrict>(output_ptr);
251
252 // Gx^2, Gy^2 and Gx*Gy
253 float32x4x2_t gx2 =
254 {
255 {
256 vdupq_n_f32(0.0f),
257 vdupq_n_f32(0.0f)
258 }
259 };
260 float32x4x2_t gy2 =
261 {
262 {
263 vdupq_n_f32(0.0f),
264 vdupq_n_f32(0.0f)
265 }
266 };
267 float32x4x2_t gxgy =
268 {
269 {
270 vdupq_n_f32(0.0f),
271 vdupq_n_f32(0.0f)
272 }
273 };
274
275 // Row0
276 int16x8x2_t tmp_gx =
277 {
278 {
279 vld1q_s16(gx_ptr_0 - input_stride),
280 vld1q_s16(gx_ptr_1 - input_stride)
281 }
282 };
283 int16x8x2_t tmp_gy =
284 {
285 {
286 vld1q_s16(gy_ptr_0 - input_stride),
287 vld1q_s16(gy_ptr_1 - input_stride)
288 }
289 };
290 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
291 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
292 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
293
294 float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
295 float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
296 float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
297 float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
298 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
299
300 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
301 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
302 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
303 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
304 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
305
306 // Row1
307 tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
308 tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
309 tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
310 tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
311
312 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
313 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
314 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
315 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
316 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
317
318 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
319 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
320 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
321 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
322 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
323
324 // Row2
325 tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
326 tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
327 tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
328 tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
329
330 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
331 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
332 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
333 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
334 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
335
336 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
337 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
338 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
339 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
340 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
341
342 // Calculate harris score
343 const float32x4x2_t mc =
344 {
345 {
346 harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
347 harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
348 }
349 };
350
351 // Store score
352 vst1q_f32(output + 0, mc.val[0]);
353 vst1q_f32(output + 4, mc.val[1]);
354}
355
356inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
357 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
358{
359 auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
360 auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
361 const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
362 const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
363 const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
364 const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
365 const auto output = static_cast<float *__restrict>(output_ptr);
366 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
367 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
368 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
369
370 // Gx^2, Gy^2 and Gx*Gy
371 float32x4x2_t gx2 =
372 {
373 {
374 vdupq_n_f32(0.0f),
375 vdupq_n_f32(0.0f)
376 }
377 };
378 float32x4x2_t gy2 =
379 {
380 {
381 vdupq_n_f32(0.0f),
382 vdupq_n_f32(0.0f)
383 }
384 };
385 float32x4x2_t gxgy =
386 {
387 {
388 vdupq_n_f32(0.0f),
389 vdupq_n_f32(0.0f)
390 }
391 };
392
393 // Row0
394 float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
395 float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
396 float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
397 float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
398 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
399
400 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
401 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
402 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
403 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
404 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
405
406 // Row1
407 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
408 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
409 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
410 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
411 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
412
413 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
414 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
415 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
416 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
417 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
418
419 // Row2
420 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
421 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
422 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
423 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
424 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
425
426 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
427 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
428 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
429 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
430 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
431
432 // Calculate harris score
433 const float32x4x2_t mc =
434 {
435 {
436 harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
437 harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
438 }
439 };
440
441 // Store score
442 vst1q_f32(output + 0, mc.val[0]);
443 vst1q_f32(output + 4, mc.val[1]);
444}
445
446inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
447 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
448{
449 auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
450 auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
451 const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
452 const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
453 const auto output = static_cast<float *__restrict>(output_ptr);
454
455 // Gx^2, Gy^2 and Gx*Gy
456 float32x4x2_t gx2 =
457 {
458 {
459 vdupq_n_f32(0.0f),
460 vdupq_n_f32(0.0f)
461 }
462 };
463 float32x4x2_t gy2 =
464 {
465 {
466 vdupq_n_f32(0.0f),
467 vdupq_n_f32(0.0f)
468 }
469 };
470 float32x4x2_t gxgy =
471 {
472 {
473 vdupq_n_f32(0.0f),
474 vdupq_n_f32(0.0f)
475 }
476 };
477 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
478 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
479 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
480
481 for(int i = 0; i < 5; ++i)
482 {
483 const int16x8x2_t tmp_gx =
484 {
485 {
486 vld1q_s16(gx_ptr_0),
487 vld1q_s16(gx_ptr_1)
488 }
489 };
490 const int16x8x2_t tmp_gy =
491 {
492 {
493 vld1q_s16(gy_ptr_0),
494 vld1q_s16(gy_ptr_1)
495 }
496 };
497
498 float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
499 float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
500 float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
501 float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
502 harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
503
504 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
505 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
506 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
507 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
508 harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
509
510 // Update gx and gy pointer
511 gx_ptr_0 += input_stride;
512 gy_ptr_0 += input_stride;
513 gx_ptr_1 += input_stride;
514 gy_ptr_1 += input_stride;
515 }
516
517 // Calculate harris score
518 const float32x4x2_t mc =
519 {
520 {
521 harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
522 harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
523 }
524 };
525
526 // Store score
527 vst1q_f32(output + 0, mc.val[0]);
528 vst1q_f32(output + 4, mc.val[1]);
529}
530
531inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
532 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
533
534{
535 auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
536 auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
537 const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
538 const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
539 const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
540 const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
541 const auto output = static_cast<float *__restrict>(output_ptr);
542
543 // Gx^2, Gy^2 and Gx*Gy
544 float32x4x2_t gx2 =
545 {
546 {
547 vdupq_n_f32(0.0f),
548 vdupq_n_f32(0.0f)
549 }
550 };
551 float32x4x2_t gy2 =
552 {
553 {
554 vdupq_n_f32(0.0f),
555 vdupq_n_f32(0.0f)
556 }
557 };
558 float32x4x2_t gxgy =
559 {
560 {
561 vdupq_n_f32(0.0f),
562 vdupq_n_f32(0.0f)
563 }
564 };
565 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
566 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
567 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
568
569 for(int i = 0; i < 5; ++i)
570 {
571 const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
572 const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
573 const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
574 const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
575 harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
576
577 const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
578 const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
579 const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
580 const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
581 harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
582
583 // Update gx and gy pointer
584 gx_ptr_0 += input_stride;
585 gy_ptr_0 += input_stride;
586 gx_ptr_1 += input_stride;
587 gy_ptr_1 += input_stride;
588 gx_ptr_2 += input_stride;
589 gy_ptr_2 += input_stride;
590 }
591
592 // Calculate harris score
593 const float32x4x2_t mc =
594 {
595 {
596 harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
597 harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
598 }
599 };
600
601 // Store score
602 vst1q_f32(output + 0, mc.val[0]);
603 vst1q_f32(output + 4, mc.val[1]);
604}
605
606inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
607 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
608{
609 auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
610 auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
611 const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
612 const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
613 const auto output = static_cast<float *__restrict>(output_ptr);
614
615 // Gx^2, Gy^2 and Gx*Gy
616 float32x4_t gx2 = vdupq_n_f32(0.0f);
617 float32x4_t gy2 = vdupq_n_f32(0.0f);
618 float32x4_t gxgy = vdupq_n_f32(0.0f);
619 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
620 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
621 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
622
623 for(int i = 0; i < 7; ++i)
624 {
625 const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
626 const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
627 const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
628 const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
629
630 float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
631 float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
632 float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
633 float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
634 float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
635 float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
636 harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
637
638 // Update gx and gy pointer
639 gx_ptr_0 += input_stride;
640 gy_ptr_0 += input_stride;
641 gx_ptr_1 += input_stride;
642 gy_ptr_1 += input_stride;
643 }
644
645 // Calculate harris score
646 const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
647
648 // Store score
649 vst1q_f32(output, mc);
650}
651
652inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
653 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
654{
655 auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
656 auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
657 const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
658 const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
659 const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
660 const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
661 const auto output = static_cast<float *__restrict>(output_ptr);
662
663 // Gx^2, Gy^2 and Gx*Gy
664 float32x4_t gx2 = vdupq_n_f32(0.0f);
665 float32x4_t gy2 = vdupq_n_f32(0.0f);
666 float32x4_t gxgy = vdupq_n_f32(0.0f);
667 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
668 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
669 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
670
671 for(int i = 0; i < 7; ++i)
672 {
673 const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
674 const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
675 const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
676 const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
677 const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
678 const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
679 harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
680
681 // Update gx and gy pointer
682 gx_ptr_0 += input_stride;
683 gy_ptr_0 += input_stride;
684 gx_ptr_1 += input_stride;
685 gy_ptr_1 += input_stride;
686 gx_ptr_2 += input_stride;
687 gy_ptr_2 += input_stride;
688 }
689
690 // Calculate harris score
691 const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
692
693 // Store score
694 vst1q_f32(output, mc);
695}
696
697} // namespace
698
699INEHarrisScoreKernel::INEHarrisScoreKernel()
700 : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
701{
702}
703
704template <int32_t block_size>
705NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
706 : INEHarrisScoreKernel(), _func(nullptr)
707{
708}
709
710template <int32_t block_size>
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100711void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100712{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100713 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100714 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
715 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
716 ARM_COMPUTE_ERROR_ON(_func == nullptr);
717
718 Iterator input1(_input1, window);
719 Iterator input2(_input2, window);
720 Iterator output(_output, window);
721
722 const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
723
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100724 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100725 {
726 (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
727 },
728 input1, input2, output);
729}
730
731template <int32_t block_size>
732BorderSize NEHarrisScoreKernel<block_size>::border_size() const
733{
734 return _border_size;
735}
736
737template <int32_t block_size>
738void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
739 bool border_undefined)
740{
741 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
742 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
743 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
744 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
745 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
746 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
747 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
748 ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
749
750 _input1 = input1;
751 _input2 = input2;
752 _output = output;
753 _sensitivity = sensitivity;
754 _strength_thresh = strength_thresh;
755 _norm_factor = norm_factor;
756 _border_size = BorderSize(block_size / 2);
757
758 if(input1->info()->data_type() == DataType::S16)
759 {
760 switch(block_size)
761 {
762 case 3:
763 _func = &harris_score3x3_S16_S16_FLOAT;
764 break;
765 case 5:
766 _func = &harris_score5x5_S16_S16_FLOAT;
767 break;
768 case 7:
769 _func = &harris_score7x7_S16_S16_FLOAT;
770 break;
771 default:
772 ARM_COMPUTE_ERROR("Invalid block size");
773 break;
774 }
775 }
776 else
777 {
778 switch(block_size)
779 {
780 case 3:
781 _func = &harris_score3x3_S32_S32_FLOAT;
782 break;
783 case 5:
784 _func = &harris_score5x5_S32_S32_FLOAT;
785 break;
786 case 7:
787 _func = &harris_score7x7_S32_S32_FLOAT;
788 break;
789 default:
790 ARM_COMPUTE_ERROR("Invalid block size");
791 break;
792 }
793 }
794
795 ARM_COMPUTE_ERROR_ON(nullptr == _func);
796
797 constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
798 constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
799 constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
800 constexpr unsigned int num_rows_read_per_iteration = block_size;
801
802 // Configure kernel window
803 Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
804 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
805
806 update_window_and_padding(win,
807 AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
808 AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
809 output_access);
810
811 ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
812 input2->info()->valid_region());
813
814 output_access.set_valid_region(win, valid_region, border_undefined, border_size());
815
816 INEKernel::configure(win);
817}