blob: 34e68e71cbf717c05ae662930b9d922336dd7ad1 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michalis Spyroua4f378d2019-04-26 14:54:54 +01002 * Copyright (c) 2016-2019 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
25
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Types.h"
31#include "arm_compute/core/Utils.h"
32#include "arm_compute/core/Validate.h"
33#include "arm_compute/core/Window.h"
34
35#include <algorithm>
36#include <arm_neon.h>
37#include <cmath>
38#include <cstddef>
39
40using namespace arm_compute;
41
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042template class arm_compute::NEHarrisScoreKernel<3>;
43template class arm_compute::NEHarrisScoreKernel<5>;
44template class arm_compute::NEHarrisScoreKernel<7>;
45template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
46template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
47template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
48
49namespace
50{
51inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
52{
53 // Trace^2
54 float32x4_t trace2 = vaddq_f32(gx2, gy2);
55 trace2 = vmulq_f32(trace2, trace2);
56
57 // Det(A)
58 float32x4_t det = vmulq_f32(gx2, gy2);
59 det = vmlsq_f32(det, gxgy, gxgy);
60
61 // Det(A) - sensitivity * trace^2
62 const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
63
64 // mc > strength_thresh
65 const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
66
67 return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
68}
69
70inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
71 float32x4_t norm_factor)
72{
73 // Normalize
74 low_gx = vmulq_f32(low_gx, norm_factor);
75 low_gy = vmulq_f32(low_gy, norm_factor);
76 high_gx = vmulq_f32(high_gx, norm_factor);
77 high_gy = vmulq_f32(high_gy, norm_factor);
78
79 const float32x4_t l_gx = low_gx;
80 const float32x4_t l_gy = low_gy;
81 const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
82 const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
83 const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
84 const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
85
86 // Gx*Gx
87 gx2 = vmlaq_f32(gx2, l_gx, l_gx);
88 gx2 = vmlaq_f32(gx2, m_gx, m_gx);
89 gx2 = vmlaq_f32(gx2, r_gx, r_gx);
90
91 // Gy*Gy
92 gy2 = vmlaq_f32(gy2, l_gy, l_gy);
93 gy2 = vmlaq_f32(gy2, m_gy, m_gy);
94 gy2 = vmlaq_f32(gy2, r_gy, r_gy);
95
96 // Gx*Gy
97 gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
98 gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
99 gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
100}
101
102inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
103 float32x4_t norm_factor)
104{
105 // Normalize
106 low_gx = vmulq_f32(low_gx, norm_factor);
107 low_gy = vmulq_f32(low_gy, norm_factor);
108 high_gx = vmulq_f32(high_gx, norm_factor);
109 high_gy = vmulq_f32(high_gy, norm_factor);
110
111 // L2 values
112 float32x4_t gx = low_gx;
113 float32x4_t gy = low_gy;
114
115 // Accumulate
116 gx2 = vmlaq_f32(gx2, gx, gx);
117 gy2 = vmlaq_f32(gy2, gy, gy);
118 gxgy = vmlaq_f32(gxgy, gx, gy);
119
120 // L1 values
121 gx = vextq_f32(low_gx, high_gx, 1);
122 gy = vextq_f32(low_gy, high_gy, 1);
123
124 // Accumulate
125 gx2 = vmlaq_f32(gx2, gx, gx);
126 gy2 = vmlaq_f32(gy2, gy, gy);
127 gxgy = vmlaq_f32(gxgy, gx, gy);
128
129 // M values
130 gx = vextq_f32(low_gx, high_gx, 2);
131 gy = vextq_f32(low_gy, high_gy, 2);
132
133 // Accumulate
134 gx2 = vmlaq_f32(gx2, gx, gx);
135 gy2 = vmlaq_f32(gy2, gy, gy);
136 gxgy = vmlaq_f32(gxgy, gx, gy);
137
138 // R1 values
139 gx = vextq_f32(low_gx, high_gx, 3);
140 gy = vextq_f32(low_gy, high_gy, 3);
141
142 // Accumulate
143 gx2 = vmlaq_f32(gx2, gx, gx);
144 gy2 = vmlaq_f32(gy2, gy, gy);
145 gxgy = vmlaq_f32(gxgy, gx, gy);
146
147 // R2 values
148 gx = high_gx;
149 gy = high_gy;
150
151 // Accumulate
152 gx2 = vmlaq_f32(gx2, gx, gx);
153 gy2 = vmlaq_f32(gy2, gy, gy);
154 gxgy = vmlaq_f32(gxgy, gx, gy);
155}
156
157inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
158 float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
159{
160 // Normalize
161 low_gx = vmulq_f32(low_gx, norm_factor);
162 low_gy = vmulq_f32(low_gy, norm_factor);
163 high_gx = vmulq_f32(high_gx, norm_factor);
164 high_gy = vmulq_f32(high_gy, norm_factor);
165
166 // L3 values
167 float32x4_t gx = low_gx;
168 float32x4_t gy = low_gy;
169
170 // Accumulate
171 gx2 = vmlaq_f32(gx2, gx, gx);
172 gy2 = vmlaq_f32(gy2, gy, gy);
173 gxgy = vmlaq_f32(gxgy, gx, gy);
174
175 // L2 values
176 gx = vextq_f32(low_gx, high_gx, 1);
177 gy = vextq_f32(low_gy, high_gy, 1);
178
179 // Accumulate
180 gx2 = vmlaq_f32(gx2, gx, gx);
181 gy2 = vmlaq_f32(gy2, gy, gy);
182 gxgy = vmlaq_f32(gxgy, gx, gy);
183
184 // L1 values
185 gx = vextq_f32(low_gx, high_gx, 2);
186 gy = vextq_f32(low_gy, high_gy, 2);
187
188 // Accumulate
189 gx2 = vmlaq_f32(gx2, gx, gx);
190 gy2 = vmlaq_f32(gy2, gy, gy);
191 gxgy = vmlaq_f32(gxgy, gx, gy);
192
193 // M values
194 gx = vextq_f32(low_gx, high_gx, 3);
195 gy = vextq_f32(low_gy, high_gy, 3);
196
197 // Accumulate
198 gx2 = vmlaq_f32(gx2, gx, gx);
199 gy2 = vmlaq_f32(gy2, gy, gy);
200 gxgy = vmlaq_f32(gxgy, gx, gy);
201
202 // R1 values
203 gx = high_gx;
204 gy = high_gy;
205
206 // Accumulate
207 gx2 = vmlaq_f32(gx2, gx, gx);
208 gy2 = vmlaq_f32(gy2, gy, gy);
209 gxgy = vmlaq_f32(gxgy, gx, gy);
210
211 // Change tmp_low and tmp_high for calculating R2 and R3 values
212 low_gx = high_gx;
213 low_gy = high_gy;
214 high_gx = high_gx1;
215 high_gy = high_gy1;
216
217 // Normalize
218 high_gx = vmulq_f32(high_gx, norm_factor);
219 high_gy = vmulq_f32(high_gy, norm_factor);
220
221 // R2 values
222 gx = vextq_f32(low_gx, high_gx, 1);
223 gy = vextq_f32(low_gy, high_gy, 1);
224
225 // Accumulate
226 gx2 = vmlaq_f32(gx2, gx, gx);
227 gy2 = vmlaq_f32(gy2, gy, gy);
228 gxgy = vmlaq_f32(gxgy, gx, gy);
229
230 // R3 values
231 gx = vextq_f32(low_gx, high_gx, 2);
232 gy = vextq_f32(low_gy, high_gy, 2);
233
234 // Accumulate
235 gx2 = vmlaq_f32(gx2, gx, gx);
236 gy2 = vmlaq_f32(gy2, gy, gy);
237 gxgy = vmlaq_f32(gxgy, gx, gy);
238}
239
240inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
241 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
242
243{
244 const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
245 const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
246 const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
247 const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
248 const auto output = static_cast<float *__restrict>(output_ptr);
249
250 // Gx^2, Gy^2 and Gx*Gy
251 float32x4x2_t gx2 =
252 {
253 {
254 vdupq_n_f32(0.0f),
255 vdupq_n_f32(0.0f)
256 }
257 };
258 float32x4x2_t gy2 =
259 {
260 {
261 vdupq_n_f32(0.0f),
262 vdupq_n_f32(0.0f)
263 }
264 };
265 float32x4x2_t gxgy =
266 {
267 {
268 vdupq_n_f32(0.0f),
269 vdupq_n_f32(0.0f)
270 }
271 };
272
273 // Row0
274 int16x8x2_t tmp_gx =
275 {
276 {
277 vld1q_s16(gx_ptr_0 - input_stride),
278 vld1q_s16(gx_ptr_1 - input_stride)
279 }
280 };
281 int16x8x2_t tmp_gy =
282 {
283 {
284 vld1q_s16(gy_ptr_0 - input_stride),
285 vld1q_s16(gy_ptr_1 - input_stride)
286 }
287 };
288 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
289 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
290 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
291
292 float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
293 float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
294 float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
295 float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
296 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
297
298 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
299 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
300 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
301 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
302 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
303
304 // Row1
305 tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
306 tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
307 tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
308 tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
309
310 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
311 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
312 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
313 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
314 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
315
316 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
317 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
318 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
319 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
320 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
321
322 // Row2
323 tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
324 tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
325 tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
326 tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
327
328 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
329 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
330 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
331 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
332 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
333
334 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
335 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
336 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
337 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
338 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
339
340 // Calculate harris score
341 const float32x4x2_t mc =
342 {
343 {
344 harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
345 harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
346 }
347 };
348
349 // Store score
350 vst1q_f32(output + 0, mc.val[0]);
351 vst1q_f32(output + 4, mc.val[1]);
352}
353
354inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
355 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
356{
357 auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
358 auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
359 const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
360 const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
361 const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
362 const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
363 const auto output = static_cast<float *__restrict>(output_ptr);
364 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
365 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
366 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
367
368 // Gx^2, Gy^2 and Gx*Gy
369 float32x4x2_t gx2 =
370 {
371 {
372 vdupq_n_f32(0.0f),
373 vdupq_n_f32(0.0f)
374 }
375 };
376 float32x4x2_t gy2 =
377 {
378 {
379 vdupq_n_f32(0.0f),
380 vdupq_n_f32(0.0f)
381 }
382 };
383 float32x4x2_t gxgy =
384 {
385 {
386 vdupq_n_f32(0.0f),
387 vdupq_n_f32(0.0f)
388 }
389 };
390
391 // Row0
392 float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
393 float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
394 float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
395 float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
396 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
397
398 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
399 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
400 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
401 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
402 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
403
404 // Row1
405 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
406 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
407 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
408 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
409 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
410
411 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
412 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
413 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
414 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
415 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
416
417 // Row2
418 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
419 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
420 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
421 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
422 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
423
424 low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
425 low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
426 high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
427 high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
428 harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
429
430 // Calculate harris score
431 const float32x4x2_t mc =
432 {
433 {
434 harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
435 harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
436 }
437 };
438
439 // Store score
440 vst1q_f32(output + 0, mc.val[0]);
441 vst1q_f32(output + 4, mc.val[1]);
442}
443
444inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
445 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
446{
447 auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
448 auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
449 const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
450 const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
451 const auto output = static_cast<float *__restrict>(output_ptr);
452
453 // Gx^2, Gy^2 and Gx*Gy
454 float32x4x2_t gx2 =
455 {
456 {
457 vdupq_n_f32(0.0f),
458 vdupq_n_f32(0.0f)
459 }
460 };
461 float32x4x2_t gy2 =
462 {
463 {
464 vdupq_n_f32(0.0f),
465 vdupq_n_f32(0.0f)
466 }
467 };
468 float32x4x2_t gxgy =
469 {
470 {
471 vdupq_n_f32(0.0f),
472 vdupq_n_f32(0.0f)
473 }
474 };
475 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
476 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
477 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
478
479 for(int i = 0; i < 5; ++i)
480 {
481 const int16x8x2_t tmp_gx =
482 {
483 {
484 vld1q_s16(gx_ptr_0),
485 vld1q_s16(gx_ptr_1)
486 }
487 };
488 const int16x8x2_t tmp_gy =
489 {
490 {
491 vld1q_s16(gy_ptr_0),
492 vld1q_s16(gy_ptr_1)
493 }
494 };
495
496 float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
497 float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
498 float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
499 float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
500 harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
501
502 low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
503 low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
504 high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
505 high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
506 harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
507
508 // Update gx and gy pointer
509 gx_ptr_0 += input_stride;
510 gy_ptr_0 += input_stride;
511 gx_ptr_1 += input_stride;
512 gy_ptr_1 += input_stride;
513 }
514
515 // Calculate harris score
516 const float32x4x2_t mc =
517 {
518 {
519 harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
520 harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
521 }
522 };
523
524 // Store score
525 vst1q_f32(output + 0, mc.val[0]);
526 vst1q_f32(output + 4, mc.val[1]);
527}
528
529inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
530 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
531
532{
533 auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
534 auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
535 const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
536 const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
537 const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
538 const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
539 const auto output = static_cast<float *__restrict>(output_ptr);
540
541 // Gx^2, Gy^2 and Gx*Gy
542 float32x4x2_t gx2 =
543 {
544 {
545 vdupq_n_f32(0.0f),
546 vdupq_n_f32(0.0f)
547 }
548 };
549 float32x4x2_t gy2 =
550 {
551 {
552 vdupq_n_f32(0.0f),
553 vdupq_n_f32(0.0f)
554 }
555 };
556 float32x4x2_t gxgy =
557 {
558 {
559 vdupq_n_f32(0.0f),
560 vdupq_n_f32(0.0f)
561 }
562 };
563 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
564 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
565 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
566
567 for(int i = 0; i < 5; ++i)
568 {
569 const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
570 const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
571 const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
572 const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
573 harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
574
575 const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
576 const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
577 const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
578 const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
579 harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
580
581 // Update gx and gy pointer
582 gx_ptr_0 += input_stride;
583 gy_ptr_0 += input_stride;
584 gx_ptr_1 += input_stride;
585 gy_ptr_1 += input_stride;
586 gx_ptr_2 += input_stride;
587 gy_ptr_2 += input_stride;
588 }
589
590 // Calculate harris score
591 const float32x4x2_t mc =
592 {
593 {
594 harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
595 harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
596 }
597 };
598
599 // Store score
600 vst1q_f32(output + 0, mc.val[0]);
601 vst1q_f32(output + 4, mc.val[1]);
602}
603
604inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
605 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
606{
607 auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
608 auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
609 const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
610 const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
611 const auto output = static_cast<float *__restrict>(output_ptr);
612
613 // Gx^2, Gy^2 and Gx*Gy
614 float32x4_t gx2 = vdupq_n_f32(0.0f);
615 float32x4_t gy2 = vdupq_n_f32(0.0f);
616 float32x4_t gxgy = vdupq_n_f32(0.0f);
617 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
618 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
619 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
620
621 for(int i = 0; i < 7; ++i)
622 {
623 const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
624 const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
625 const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
626 const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
627
628 float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
629 float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
630 float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
631 float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
632 float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
633 float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
634 harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
635
636 // Update gx and gy pointer
637 gx_ptr_0 += input_stride;
638 gy_ptr_0 += input_stride;
639 gx_ptr_1 += input_stride;
640 gy_ptr_1 += input_stride;
641 }
642
643 // Calculate harris score
644 const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
645
646 // Store score
647 vst1q_f32(output, mc);
648}
649
650inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
651 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
652{
653 auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
654 auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
655 const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
656 const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
657 const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
658 const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
659 const auto output = static_cast<float *__restrict>(output_ptr);
660
661 // Gx^2, Gy^2 and Gx*Gy
662 float32x4_t gx2 = vdupq_n_f32(0.0f);
663 float32x4_t gy2 = vdupq_n_f32(0.0f);
664 float32x4_t gxgy = vdupq_n_f32(0.0f);
665 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
666 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
667 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
668
669 for(int i = 0; i < 7; ++i)
670 {
671 const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
672 const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
673 const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
674 const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
675 const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
676 const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
677 harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
678
679 // Update gx and gy pointer
680 gx_ptr_0 += input_stride;
681 gy_ptr_0 += input_stride;
682 gx_ptr_1 += input_stride;
683 gy_ptr_1 += input_stride;
684 gx_ptr_2 += input_stride;
685 gy_ptr_2 += input_stride;
686 }
687
688 // Calculate harris score
689 const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
690
691 // Store score
692 vst1q_f32(output, mc);
693}
694
695} // namespace
696
697INEHarrisScoreKernel::INEHarrisScoreKernel()
698 : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
699{
700}
701
702template <int32_t block_size>
703NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
704 : INEHarrisScoreKernel(), _func(nullptr)
705{
706}
707
708template <int32_t block_size>
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100709void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100710{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100711 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100712 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
713 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
714 ARM_COMPUTE_ERROR_ON(_func == nullptr);
715
716 Iterator input1(_input1, window);
717 Iterator input2(_input2, window);
718 Iterator output(_output, window);
719
720 const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
721
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100722 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100723 {
724 (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
725 },
726 input1, input2, output);
727}
728
729template <int32_t block_size>
730BorderSize NEHarrisScoreKernel<block_size>::border_size() const
731{
732 return _border_size;
733}
734
735template <int32_t block_size>
736void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
737 bool border_undefined)
738{
739 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
740 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
741 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
742 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
743 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
744 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
745 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
746 ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
747
748 _input1 = input1;
749 _input2 = input2;
750 _output = output;
751 _sensitivity = sensitivity;
752 _strength_thresh = strength_thresh;
753 _norm_factor = norm_factor;
754 _border_size = BorderSize(block_size / 2);
755
756 if(input1->info()->data_type() == DataType::S16)
757 {
758 switch(block_size)
759 {
760 case 3:
761 _func = &harris_score3x3_S16_S16_FLOAT;
762 break;
763 case 5:
764 _func = &harris_score5x5_S16_S16_FLOAT;
765 break;
766 case 7:
767 _func = &harris_score7x7_S16_S16_FLOAT;
768 break;
769 default:
770 ARM_COMPUTE_ERROR("Invalid block size");
771 break;
772 }
773 }
774 else
775 {
776 switch(block_size)
777 {
778 case 3:
779 _func = &harris_score3x3_S32_S32_FLOAT;
780 break;
781 case 5:
782 _func = &harris_score5x5_S32_S32_FLOAT;
783 break;
784 case 7:
785 _func = &harris_score7x7_S32_S32_FLOAT;
786 break;
787 default:
788 ARM_COMPUTE_ERROR("Invalid block size");
789 break;
790 }
791 }
792
793 ARM_COMPUTE_ERROR_ON(nullptr == _func);
794
795 constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
796 constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
797 constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
798 constexpr unsigned int num_rows_read_per_iteration = block_size;
799
800 // Configure kernel window
801 Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
802 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
803
804 update_window_and_padding(win,
805 AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
806 AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
807 output_access);
808
809 ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
810 input2->info()->valid_region());
811
812 output_access.set_valid_region(win, valid_region, border_undefined, border_size());
813
814 INEKernel::configure(win);
815}