blob: d44f4ce3b8c72827b8cc3082910c2bc3e6333d23 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2016, 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
25
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Types.h"
31#include "arm_compute/core/Utils.h"
32#include "arm_compute/core/Validate.h"
33#include "arm_compute/core/Window.h"
34
35#include <algorithm>
36#include <arm_neon.h>
37#include <cmath>
38#include <cstddef>
39
40using namespace arm_compute;
41
42#ifdef ARM_COMPUTE_ENABLE_FP16
43
// Explicit instantiations of the FP16 Harris score kernel for block sizes 3, 5 and 7.
template class arm_compute::NEHarrisScoreFP16Kernel<3>;
template class arm_compute::NEHarrisScoreFP16Kernel<5>;
template class arm_compute::NEHarrisScoreFP16Kernel<7>;
47
48namespace fp16
49{
/** Evaluate Mc = det(A) - sensitivity * trace(A)^2 on eight half-precision lanes.
 *
 * @param[in] gx2             Accumulated Gx*Gx terms.
 * @param[in] gy2             Accumulated Gy*Gy terms.
 * @param[in] gxgy            Accumulated Gx*Gy terms.
 * @param[in] sensitivity     Scalar multiplied with the squared trace.
 * @param[in] strength_thresh Lanes whose Mc is not strictly greater than this value are set to zero.
 *
 * @return Mc per lane, or 0 in the lanes that do not exceed @p strength_thresh.
 */
inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
{
    // trace(A)^2 = (gx2 + gy2)^2
    const float16x8_t trace    = vaddq_f16(gx2, gy2);
    const float16x8_t trace_sq = vmulq_f16(trace, trace);

    // det(A) = gx2 * gy2 - gxgy * gxgy
    const float16x8_t det = vfmsq_f16(vmulq_f16(gx2, gy2), gxgy, gxgy);

    // Mc = det(A) - sensitivity * trace(A)^2
    const float16x8_t response = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace_sq);

    // Select the response where response > strength_thresh, zero elsewhere
    const uint16x8_t above_thresh = vcgtq_f16(response, vdupq_n_f16(strength_thresh));
    return vbslq_f16(above_thresh, response, vdupq_n_f16(0.f));
}
70
/** Accumulate one row of a block_size-wide window into the Gx^2, Gy^2 and Gx*Gy sums (half precision).
 *
 * The inputs are first multiplied by @p norm_factor. Tap k of the row is then the
 * eight lanes starting at lane k of the concatenation {low, high}.
 *
 * @param[in]      low_gx      First eight Gx values of the row.
 * @param[in]      low_gy      First eight Gy values of the row.
 * @param[in]      high_gx     Next eight Gx values of the row.
 * @param[in]      high_gy     Next eight Gy values of the row.
 * @param[in, out] gx2         Running sum of Gx*Gx.
 * @param[in, out] gy2         Running sum of Gy*Gy.
 * @param[in, out] gxgy        Running sum of Gx*Gy.
 * @param[in]      norm_factor Scale applied to every gradient value before accumulation.
 */
template <size_t block_size>
inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
                                              float norm_factor)
{
    // Normalize all four input vectors
    const float16x8_t scale = vdupq_n_f16(norm_factor);
    low_gx                  = vmulq_f16(low_gx, scale);
    low_gy                  = vmulq_f16(low_gy, scale);
    high_gx                 = vmulq_f16(high_gx, scale);
    high_gy                 = vmulq_f16(high_gy, scale);

    // Adds one horizontal tap to the three running sums
    const auto accumulate = [&](float16x8_t gx, float16x8_t gy)
    {
        gx2  = vfmaq_f16(gx2, gx, gx);
        gy2  = vfmaq_f16(gy2, gy, gy);
        gxgy = vfmaq_f16(gxgy, gx, gy);
    };

    // Taps 0..2 are needed for every block size
    accumulate(vextq_f16(low_gx, high_gx, 0), vextq_f16(low_gy, high_gy, 0));
    accumulate(vextq_f16(low_gx, high_gx, 1), vextq_f16(low_gy, high_gy, 1));
    accumulate(vextq_f16(low_gx, high_gx, 2), vextq_f16(low_gy, high_gy, 2));

    // Taps 3..4 for block sizes above 3
    if(block_size > 3)
    {
        accumulate(vextq_f16(low_gx, high_gx, 3), vextq_f16(low_gy, high_gy, 3));
        accumulate(vextq_f16(low_gx, high_gx, 4), vextq_f16(low_gy, high_gy, 4));
    }

    // Taps 5..6 for block size 7 only
    if(block_size == 7)
    {
        accumulate(vextq_f16(low_gx, high_gx, 5), vextq_f16(low_gy, high_gy, 5));
        accumulate(vextq_f16(low_gx, high_gx, 6), vextq_f16(low_gy, high_gy, 6));
    }
}
138
/** Harris score of eight pixels from S16 gradients, accumulated in half precision.
 *
 * @param[in]  in1_ptr         Pointer to the Gx value of the first pixel (interpreted as int16_t).
 * @param[in]  in2_ptr         Pointer to the Gy value of the first pixel (interpreted as int16_t).
 * @param[out] out_ptr         Destination for eight float scores.
 * @param[in]  in_stride       Distance between consecutive rows, in elements.
 * @param[in]  norm_factor     Scale applied to the gradients before accumulation.
 * @param[in]  sensitivity     Sensitivity used in the Harris response.
 * @param[in]  strength_thresh Responses not above this value are written as 0.
 */
template <size_t block_size>
inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
                                       float strength_thresh)
{
    // Start at the top-left element of the block_size x block_size window
    const int16_t *gx_row = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
    const int16_t *gy_row = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
    float *const   scores = static_cast<float *__restrict>(out_ptr);

    // Running sums of Gx^2, Gy^2 and Gx*Gy
    float16x8_t sum_gx2  = vdupq_n_f16(0.0f);
    float16x8_t sum_gy2  = vdupq_n_f16(0.0f);
    float16x8_t sum_gxgy = vdupq_n_f16(0.0f);

    for(size_t row = 0; row < block_size; ++row)
    {
        // Sixteen consecutive values per row, converted to half precision
        const float16x8_t low_gx  = vcvtq_f16_s16(vld1q_s16(gx_row));
        const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_row + 8));
        const float16x8_t low_gy  = vcvtq_f16_s16(vld1q_s16(gy_row));
        const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_row + 8));
        harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, sum_gx2, sum_gy2, sum_gxgy, norm_factor);

        // Next row
        gx_row += in_stride;
        gy_row += in_stride;
    }

    // Harris response for the eight pixels
    const float16x8_t response = harris_score(sum_gx2, sum_gy2, sum_gxgy, sensitivity, strength_thresh);

    // Widen to F32 and store
    vst1q_f32(scores, vcvt_f32_f16(vget_low_f16(response)));
    vst1q_f32(scores + 4, vcvt_f32_f16(vget_high_f16(response)));
}
176
/** Harris score of eight pixels from S32 gradients, accumulated in half precision.
 *
 * Twelve values are loaded per row; the upper four lanes of the "high" vectors are zero.
 *
 * @param[in]  in1_ptr         Pointer to the Gx value of the first pixel (interpreted as int32_t).
 * @param[in]  in2_ptr         Pointer to the Gy value of the first pixel (interpreted as int32_t).
 * @param[out] out_ptr         Destination for eight float scores.
 * @param[in]  in_stride       Distance between consecutive rows, in elements.
 * @param[in]  norm_factor     Scale applied to the gradients before accumulation.
 * @param[in]  sensitivity     Sensitivity used in the Harris response.
 * @param[in]  strength_thresh Responses not above this value are written as 0.
 */
template <size_t block_size>
inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
                                       float strength_thresh)
{
    // Loads four S32 values and narrows them to four half-precision values
    const auto load_f16x4 = [](const int32_t *src)
    {
        return vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(src)));
    };

    const float16x8_t zero      = vdupq_n_f16(0.0f);
    const float16x4_t zero_half = vget_low_f16(zero);

    // Start at the top-left element of the block_size x block_size window
    const int32_t *gx_row = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
    const int32_t *gy_row = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
    float *const   scores = static_cast<float *__restrict>(out_ptr);

    // Running sums of Gx^2, Gy^2 and Gx*Gy
    float16x8_t sum_gx2  = zero;
    float16x8_t sum_gy2  = zero;
    float16x8_t sum_gxgy = zero;

    for(size_t row = 0; row < block_size; ++row)
    {
        const float16x8_t low_gx  = vcombine_f16(load_f16x4(gx_row), load_f16x4(gx_row + 4));
        const float16x8_t high_gx = vcombine_f16(load_f16x4(gx_row + 8), zero_half);
        const float16x8_t low_gy  = vcombine_f16(load_f16x4(gy_row), load_f16x4(gy_row + 4));
        const float16x8_t high_gy = vcombine_f16(load_f16x4(gy_row + 8), zero_half);
        harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, sum_gx2, sum_gy2, sum_gxgy, norm_factor);

        // Next row
        gx_row += in_stride;
        gy_row += in_stride;
    }

    // Harris response for the eight pixels
    const float16x8_t response = harris_score(sum_gx2, sum_gy2, sum_gxgy, sensitivity, strength_thresh);

    // Widen to F32 and store
    vst1q_f32(scores, vcvt_f32_f16(vget_low_f16(response)));
    vst1q_f32(scores + 4, vcvt_f32_f16(vget_high_f16(response)));
}
224
/** Harris score of eight pixels from S32 gradients for a 7x7 window, accumulated in half precision.
 *
 * Sixteen values are loaded per row through four pointers (offsets 0, 4, 8 and 12),
 * because taps 5 and 6 read lanes 4 and 5 of the "high" vector.
 *
 * @param[in]  in1_ptr         Pointer to the Gx value of the first pixel (interpreted as int32_t).
 * @param[in]  in2_ptr         Pointer to the Gy value of the first pixel (interpreted as int32_t).
 * @param[out] out_ptr         Destination for eight float scores.
 * @param[in]  in_stride       Distance between consecutive rows, in elements.
 * @param[in]  norm_factor     Scale applied to the gradients before accumulation.
 * @param[in]  sensitivity     Sensitivity used in the Harris response.
 * @param[in]  strength_thresh Responses not above this value are written as 0.
 */
template <>
inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
                                          float strength_thresh)
{
    const float16x8_t zero = vdupq_n_f16(0.0f);

    // Start at the top-left element of the 7x7 window
    auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
    auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
    const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
    const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
    const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
    const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
    const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
    const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
    const auto     output   = static_cast<float *__restrict>(out_ptr);

    // Gx^2, Gy^2 and Gx*Gy
    float16x8_t gx2  = zero;
    float16x8_t gy2  = zero;
    float16x8_t gxgy = zero;

    for(size_t i = 0; i < 7; ++i)
    {
        const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
                                                vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
        const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
        const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
                                                vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
        const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
        harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);

        // Update gx and gy pointers. All four column pointers must move to the next row:
        // leaving gx_ptr_3/gy_ptr_3 behind would feed columns 12-13 of the first row
        // into every row of the window.
        gx_ptr_0 += in_stride;
        gy_ptr_0 += in_stride;
        gx_ptr_1 += in_stride;
        gy_ptr_1 += in_stride;
        gx_ptr_2 += in_stride;
        gy_ptr_2 += in_stride;
        gx_ptr_3 += in_stride;
        gy_ptr_3 += in_stride;
    }

    // Calculate harris score
    const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);

    // Store score
    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
}
274
275} // namespace fp16
276
/** Return the border size stored in _border_size (assigned in configure()). */
template <int32_t block_size>
BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
{
    return _border_size;
}
282
/** Default constructor: no score function is selected until configure() is called. */
template <int32_t block_size>
NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
    : INEHarrisScoreKernel(), _func(nullptr)
{
}
288
289template <int32_t block_size>
290void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window)
291{
292 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
293 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
294 ARM_COMPUTE_ERROR_ON(_func == nullptr);
295
296 Iterator input1(_input1, window);
297 Iterator input2(_input2, window);
298 Iterator output(_output, window);
299
300 const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
301
302 execute_window_loop(window, [&](const Coordinates & id)
303 {
304 (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
305 },
306 input1, input2, output);
307}
308
309template <int32_t block_size>
310void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
311 bool border_undefined)
312{
313 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
314 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
315 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
316 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
317 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
318 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
319 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
320 ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
321
322 _input1 = input1;
323 _input2 = input2;
324 _output = output;
325 _sensitivity = sensitivity;
326 _strength_thresh = strength_thresh;
327 _norm_factor = norm_factor;
328 _border_size = BorderSize(block_size / 2);
329
330 if(input1->info()->data_type() == DataType::S16)
331 {
332 _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
333 }
334 else
335 {
336 _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
337 }
338
339 ARM_COMPUTE_ERROR_ON(nullptr == _func);
340
341 constexpr unsigned int num_elems_processed_per_iteration = 8;
342 constexpr unsigned int num_elems_read_per_iteration = 16;
343 constexpr unsigned int num_elems_written_per_iteration = 8;
344 constexpr unsigned int num_rows_read_per_iteration = block_size;
345
346 // Configure kernel window
347 Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
348 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
349
350 update_window_and_padding(win,
351 AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
352 AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
353 output_access);
354
355 ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
356 input2->info()->valid_region());
357
358 output_access.set_valid_region(win, valid_region, border_undefined, border_size());
359
360 INEKernel::configure(win);
361}
362
Anthony Barbierac69aa12017-07-03 17:39:37 +0100363#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100364
// Explicit instantiations of the F32 Harris score kernel and of its default
// constructor for block sizes 3, 5 and 7.
template class arm_compute::NEHarrisScoreKernel<3>;
template class arm_compute::NEHarrisScoreKernel<5>;
template class arm_compute::NEHarrisScoreKernel<7>;
template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
371
372namespace
373{
/** Evaluate Mc = det(A) - sensitivity * trace(A)^2 on four single-precision lanes.
 *
 * @param[in] gx2             Accumulated Gx*Gx terms.
 * @param[in] gy2             Accumulated Gy*Gy terms.
 * @param[in] gxgy            Accumulated Gx*Gy terms.
 * @param[in] sensitivity     Per-lane factor multiplied with the squared trace.
 * @param[in] strength_thresh Per-lane threshold; lanes whose Mc is not strictly greater are set to zero.
 *
 * @return Mc per lane, or 0 in the lanes that do not exceed the threshold.
 */
inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
{
    // trace(A)^2 = (gx2 + gy2)^2
    const float32x4_t trace    = vaddq_f32(gx2, gy2);
    const float32x4_t trace_sq = vmulq_f32(trace, trace);

    // det(A) = gx2 * gy2 - gxgy * gxgy
    const float32x4_t det = vmlsq_f32(vmulq_f32(gx2, gy2), gxgy, gxgy);

    // Mc = det(A) - sensitivity * trace(A)^2
    const float32x4_t response = vmlsq_f32(det, sensitivity, trace_sq);

    // Select the response where response > strength_thresh, zero elsewhere
    const uint32x4_t above_thresh = vcgtq_f32(response, strength_thresh);
    return vbslq_f32(above_thresh, response, vdupq_n_f32(0.0f));
}
392
/** Accumulate one row of a 3-wide window into the Gx^2, Gy^2 and Gx*Gy sums for four pixels.
 *
 * @param[in]      low_gx      Gx values 0..3 of the row.
 * @param[in]      low_gy      Gy values 0..3 of the row.
 * @param[in]      high_gx     Gx values 4..7 of the row.
 * @param[in]      high_gy     Gy values 4..7 of the row.
 * @param[in, out] gx2         Running sum of Gx*Gx.
 * @param[in, out] gy2         Running sum of Gy*Gy.
 * @param[in, out] gxgy        Running sum of Gx*Gy.
 * @param[in]      norm_factor Per-lane scale applied to the gradients before accumulation.
 */
inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                              float32x4_t norm_factor)
{
    // Normalize
    const float32x4_t gx_lo = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_lo = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_hi = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_hi = vmulq_f32(high_gy, norm_factor);

    // Left, middle and right taps of the window
    const float32x4_t gx_taps[3] = { gx_lo, vextq_f32(gx_lo, gx_hi, 1), vextq_f32(gx_lo, gx_hi, 2) };
    const float32x4_t gy_taps[3] = { gy_lo, vextq_f32(gy_lo, gy_hi, 1), vextq_f32(gy_lo, gy_hi, 2) };

    for(int tap = 0; tap < 3; ++tap)
    {
        gx2  = vmlaq_f32(gx2, gx_taps[tap], gx_taps[tap]);
        gy2  = vmlaq_f32(gy2, gy_taps[tap], gy_taps[tap]);
        gxgy = vmlaq_f32(gxgy, gx_taps[tap], gy_taps[tap]);
    }
}
424
/** Accumulate one row of a 5-wide window into the Gx^2, Gy^2 and Gx*Gy sums for four pixels.
 *
 * @param[in]      low_gx      Gx values 0..3 of the row.
 * @param[in]      low_gy      Gy values 0..3 of the row.
 * @param[in]      high_gx     Gx values 4..7 of the row.
 * @param[in]      high_gy     Gy values 4..7 of the row.
 * @param[in, out] gx2         Running sum of Gx*Gx.
 * @param[in, out] gy2         Running sum of Gy*Gy.
 * @param[in, out] gxgy        Running sum of Gx*Gy.
 * @param[in]      norm_factor Per-lane scale applied to the gradients before accumulation.
 */
inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                              float32x4_t norm_factor)
{
    // Normalize
    const float32x4_t gx_lo = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_lo = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_hi = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_hi = vmulq_f32(high_gy, norm_factor);

    // Taps in accumulation order: L2, L1, M, R1, R2
    const float32x4_t gx_taps[5] =
    {
        gx_lo,
        vextq_f32(gx_lo, gx_hi, 1),
        vextq_f32(gx_lo, gx_hi, 2),
        vextq_f32(gx_lo, gx_hi, 3),
        gx_hi
    };
    const float32x4_t gy_taps[5] =
    {
        gy_lo,
        vextq_f32(gy_lo, gy_hi, 1),
        vextq_f32(gy_lo, gy_hi, 2),
        vextq_f32(gy_lo, gy_hi, 3),
        gy_hi
    };

    for(int tap = 0; tap < 5; ++tap)
    {
        gx2  = vmlaq_f32(gx2, gx_taps[tap], gx_taps[tap]);
        gy2  = vmlaq_f32(gy2, gy_taps[tap], gy_taps[tap]);
        gxgy = vmlaq_f32(gxgy, gx_taps[tap], gy_taps[tap]);
    }
}
479
/** Accumulate one row of a 7-wide window into the Gx^2, Gy^2 and Gx*Gy sums for four pixels.
 *
 * @param[in]      low_gx      Gx values 0..3 of the row.
 * @param[in]      low_gy      Gy values 0..3 of the row.
 * @param[in]      high_gx     Gx values 4..7 of the row.
 * @param[in]      high_gy     Gy values 4..7 of the row.
 * @param[in]      high_gx1    Gx values 8..11 of the row.
 * @param[in]      high_gy1    Gy values 8..11 of the row.
 * @param[in, out] gx2         Running sum of Gx*Gx.
 * @param[in, out] gy2         Running sum of Gy*Gy.
 * @param[in, out] gxgy        Running sum of Gx*Gy.
 * @param[in]      norm_factor Per-lane scale applied to the gradients before accumulation.
 */
inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
                                              float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
{
    // Normalize the three quad-words of each gradient row
    const float32x4_t gx_a = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_a = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_b = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_b = vmulq_f32(high_gy, norm_factor);
    const float32x4_t gx_c = vmulq_f32(high_gx1, norm_factor);
    const float32x4_t gy_c = vmulq_f32(high_gy1, norm_factor);

    // Taps in accumulation order: L3, L2, L1, M, R1, R2, R3
    const float32x4_t gx_taps[7] =
    {
        gx_a,
        vextq_f32(gx_a, gx_b, 1),
        vextq_f32(gx_a, gx_b, 2),
        vextq_f32(gx_a, gx_b, 3),
        gx_b,
        vextq_f32(gx_b, gx_c, 1),
        vextq_f32(gx_b, gx_c, 2)
    };
    const float32x4_t gy_taps[7] =
    {
        gy_a,
        vextq_f32(gy_a, gy_b, 1),
        vextq_f32(gy_a, gy_b, 2),
        vextq_f32(gy_a, gy_b, 3),
        gy_b,
        vextq_f32(gy_b, gy_c, 1),
        vextq_f32(gy_b, gy_c, 2)
    };

    for(int tap = 0; tap < 7; ++tap)
    {
        gx2  = vmlaq_f32(gx2, gx_taps[tap], gx_taps[tap]);
        gy2  = vmlaq_f32(gy2, gy_taps[tap], gy_taps[tap]);
        gxgy = vmlaq_f32(gxgy, gx_taps[tap], gy_taps[tap]);
    }
}
562
/** Harris score of eight pixels over a 3x3 window of S16 gradients.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first pixel (interpreted as int16_t).
 * @param[in]  input2_ptr         Pointer to the Gy value of the first pixel (interpreted as int16_t).
 * @param[out] output_ptr         Destination for eight float scores.
 * @param[in]  input_stride       Distance between consecutive rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to the gradients before accumulation.
 * @param[in]  in_sensitivity     Sensitivity used in the Harris response.
 * @param[in]  in_strength_thresh Responses not above this value are written as 0.
 */
inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Leftmost window column on the centre row
    const int16_t *const gx_centre = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
    const int16_t *const gy_centre = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
    float *const         scores    = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy: val[0] holds pixels 0..3, val[1] holds pixels 4..7
    const float32x4_t zero = vdupq_n_f32(0.0f);
    float32x4x2_t     gx2  = { { zero, zero } };
    float32x4x2_t     gy2  = { { zero, zero } };
    float32x4x2_t     gxgy = { { zero, zero } };

    // Rows above, at and below the centre
    for(int row = -1; row <= 1; ++row)
    {
        const int16_t *const gx_row = gx_centre + row * input_stride;
        const int16_t *const gy_row = gy_centre + row * input_stride;

        for(int half = 0; half < 2; ++half)
        {
            // Eight S16 values starting at column 4 * half, widened to F32
            const int16x8_t gx_s16 = vld1q_s16(gx_row + 4 * half);
            const int16x8_t gy_s16 = vld1q_s16(gy_row + 4 * half);

            const float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx_s16)));
            const float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy_s16)));
            const float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx_s16)));
            const float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy_s16)));
            harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[half], gy2.val[half], gxgy.val[half], norm_factor);
        }
    }

    // Calculate and store the harris score of each group of four pixels
    for(int half = 0; half < 2; ++half)
    {
        vst1q_f32(scores + 4 * half, harris_score(gx2.val[half], gy2.val[half], gxgy.val[half], sensitivity, strength_thresh));
    }
}
676
/** Harris score of eight pixels over a 3x3 window of S32 gradients.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first pixel (interpreted as int32_t).
 * @param[in]  input2_ptr         Pointer to the Gy value of the first pixel (interpreted as int32_t).
 * @param[out] output_ptr         Destination for eight float scores.
 * @param[in]  input_stride       Distance between consecutive rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to the gradients before accumulation.
 * @param[in]  in_sensitivity     Sensitivity used in the Harris response.
 * @param[in]  in_strength_thresh Responses not above this value are written as 0.
 */
inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Leftmost window column on the centre row
    const int32_t *const gx_centre = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
    const int32_t *const gy_centre = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
    float *const         scores    = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy: val[0] holds pixels 0..3, val[1] holds pixels 4..7
    const float32x4_t zero = vdupq_n_f32(0.0f);
    float32x4x2_t     gx2  = { { zero, zero } };
    float32x4x2_t     gy2  = { { zero, zero } };
    float32x4x2_t     gxgy = { { zero, zero } };

    // Rows above, at and below the centre
    for(int row = -1; row <= 1; ++row)
    {
        const int32_t *const gx_row = gx_centre + row * input_stride;
        const int32_t *const gy_row = gy_centre + row * input_stride;

        for(int half = 0; half < 2; ++half)
        {
            // Two adjacent quad-words starting at column 4 * half
            const float32x4_t low_gx  = vcvtq_f32_s32(vld1q_s32(gx_row + 4 * half));
            const float32x4_t low_gy  = vcvtq_f32_s32(vld1q_s32(gy_row + 4 * half));
            const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_row + 4 * half + 4));
            const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_row + 4 * half + 4));
            harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[half], gy2.val[half], gxgy.val[half], norm_factor);
        }
    }

    // Calculate and store the harris score of each group of four pixels
    for(int half = 0; half < 2; ++half)
    {
        vst1q_f32(scores + 4 * half, harris_score(gx2.val[half], gy2.val[half], gxgy.val[half], sensitivity, strength_thresh));
    }
}
766
/** Harris score of eight pixels over a 5x5 window of S16 gradients.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first pixel (interpreted as int16_t).
 * @param[in]  input2_ptr         Pointer to the Gy value of the first pixel (interpreted as int16_t).
 * @param[out] output_ptr         Destination for eight float scores.
 * @param[in]  input_stride       Distance between consecutive rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to the gradients before accumulation.
 * @param[in]  in_sensitivity     Sensitivity used in the Harris response.
 * @param[in]  in_strength_thresh Responses not above this value are written as 0.
 */
inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Top-left element of the 5x5 window
    const int16_t *const gx_top = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
    const int16_t *const gy_top = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
    float *const         scores = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy: val[0] holds pixels 0..3, val[1] holds pixels 4..7
    const float32x4_t zero = vdupq_n_f32(0.0f);
    float32x4x2_t     gx2  = { { zero, zero } };
    float32x4x2_t     gy2  = { { zero, zero } };
    float32x4x2_t     gxgy = { { zero, zero } };

    for(int row = 0; row < 5; ++row)
    {
        const int16_t *const gx_row = gx_top + row * input_stride;
        const int16_t *const gy_row = gy_top + row * input_stride;

        for(int half = 0; half < 2; ++half)
        {
            // Eight S16 values starting at column 4 * half, widened to F32
            const int16x8_t gx_s16 = vld1q_s16(gx_row + 4 * half);
            const int16x8_t gy_s16 = vld1q_s16(gy_row + 4 * half);

            const float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx_s16)));
            const float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy_s16)));
            const float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx_s16)));
            const float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy_s16)));
            harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[half], gy2.val[half], gxgy.val[half], norm_factor);
        }
    }

    // Calculate and store the harris score of each group of four pixels
    for(int half = 0; half < 2; ++half)
    {
        vst1q_f32(scores + 4 * half, harris_score(gx2.val[half], gy2.val[half], gxgy.val[half], sensitivity, strength_thresh));
    }
}
851
/** Harris score of eight pixels over a 5x5 window of S32 gradients.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first pixel (interpreted as int32_t).
 * @param[in]  input2_ptr         Pointer to the Gy value of the first pixel (interpreted as int32_t).
 * @param[out] output_ptr         Destination for eight float scores.
 * @param[in]  input_stride       Distance between consecutive rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to the gradients before accumulation.
 * @param[in]  in_sensitivity     Sensitivity used in the Harris response.
 * @param[in]  in_strength_thresh Responses not above this value are written as 0.
 */
inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Top-left element of the 5x5 window
    const int32_t *const gx_top = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
    const int32_t *const gy_top = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
    float *const         scores = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy: val[0] holds pixels 0..3, val[1] holds pixels 4..7
    const float32x4_t zero = vdupq_n_f32(0.0f);
    float32x4x2_t     gx2  = { { zero, zero } };
    float32x4x2_t     gy2  = { { zero, zero } };
    float32x4x2_t     gxgy = { { zero, zero } };

    for(int row = 0; row < 5; ++row)
    {
        const int32_t *const gx_row = gx_top + row * input_stride;
        const int32_t *const gy_row = gy_top + row * input_stride;

        for(int half = 0; half < 2; ++half)
        {
            // Two adjacent quad-words starting at column 4 * half
            const float32x4_t low_gx  = vcvtq_f32_s32(vld1q_s32(gx_row + 4 * half));
            const float32x4_t low_gy  = vcvtq_f32_s32(vld1q_s32(gy_row + 4 * half));
            const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_row + 4 * half + 4));
            const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_row + 4 * half + 4));
            harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[half], gy2.val[half], gxgy.val[half], norm_factor);
        }
    }

    // Calculate and store the harris score of each group of four pixels
    for(int half = 0; half < 2; ++half)
    {
        vst1q_f32(scores + 4 * half, harris_score(gx2.val[half], gy2.val[half], gxgy.val[half], sensitivity, strength_thresh));
    }
}
926
927inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
928 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
929{
930 auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
931 auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
932 const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
933 const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
934 const auto output = static_cast<float *__restrict>(output_ptr);
935
936 // Gx^2, Gy^2 and Gx*Gy
937 float32x4_t gx2 = vdupq_n_f32(0.0f);
938 float32x4_t gy2 = vdupq_n_f32(0.0f);
939 float32x4_t gxgy = vdupq_n_f32(0.0f);
940 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
941 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
942 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
943
944 for(int i = 0; i < 7; ++i)
945 {
946 const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
947 const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
948 const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
949 const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
950
951 float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
952 float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
953 float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
954 float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
955 float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
956 float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
957 harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
958
959 // Update gx and gy pointer
960 gx_ptr_0 += input_stride;
961 gy_ptr_0 += input_stride;
962 gx_ptr_1 += input_stride;
963 gy_ptr_1 += input_stride;
964 }
965
966 // Calculate harris score
967 const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
968
969 // Store score
970 vst1q_f32(output, mc);
971}
972
973inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
974 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
975{
976 auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
977 auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
978 const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
979 const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
980 const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
981 const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
982 const auto output = static_cast<float *__restrict>(output_ptr);
983
984 // Gx^2, Gy^2 and Gx*Gy
985 float32x4_t gx2 = vdupq_n_f32(0.0f);
986 float32x4_t gy2 = vdupq_n_f32(0.0f);
987 float32x4_t gxgy = vdupq_n_f32(0.0f);
988 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
989 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
990 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
991
992 for(int i = 0; i < 7; ++i)
993 {
994 const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
995 const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
996 const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
997 const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
998 const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
999 const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
1000 harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
1001
1002 // Update gx and gy pointer
1003 gx_ptr_0 += input_stride;
1004 gy_ptr_0 += input_stride;
1005 gx_ptr_1 += input_stride;
1006 gy_ptr_1 += input_stride;
1007 gx_ptr_2 += input_stride;
1008 gy_ptr_2 += input_stride;
1009 }
1010
1011 // Calculate harris score
1012 const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
1013
1014 // Store score
1015 vst1q_f32(output, mc);
1016}
1017
1018} // namespace
1019
1020INEHarrisScoreKernel::INEHarrisScoreKernel()
1021 : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
1022{
1023}
1024
1025template <int32_t block_size>
1026NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
1027 : INEHarrisScoreKernel(), _func(nullptr)
1028{
1029}
1030
1031template <int32_t block_size>
1032void NEHarrisScoreKernel<block_size>::run(const Window &window)
1033{
1034 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1035 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1036 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1037
1038 Iterator input1(_input1, window);
1039 Iterator input2(_input2, window);
1040 Iterator output(_output, window);
1041
1042 const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
1043
1044 execute_window_loop(window, [&](const Coordinates & id)
1045 {
1046 (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
1047 },
1048 input1, input2, output);
1049}
1050
1051template <int32_t block_size>
1052BorderSize NEHarrisScoreKernel<block_size>::border_size() const
1053{
1054 return _border_size;
1055}
1056
1057template <int32_t block_size>
1058void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
1059 bool border_undefined)
1060{
1061 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
1062 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
1063 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
1064 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
1065 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
1066 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
1067 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
1068 ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
1069
1070 _input1 = input1;
1071 _input2 = input2;
1072 _output = output;
1073 _sensitivity = sensitivity;
1074 _strength_thresh = strength_thresh;
1075 _norm_factor = norm_factor;
1076 _border_size = BorderSize(block_size / 2);
1077
1078 if(input1->info()->data_type() == DataType::S16)
1079 {
1080 switch(block_size)
1081 {
1082 case 3:
1083 _func = &harris_score3x3_S16_S16_FLOAT;
1084 break;
1085 case 5:
1086 _func = &harris_score5x5_S16_S16_FLOAT;
1087 break;
1088 case 7:
1089 _func = &harris_score7x7_S16_S16_FLOAT;
1090 break;
1091 default:
1092 ARM_COMPUTE_ERROR("Invalid block size");
1093 break;
1094 }
1095 }
1096 else
1097 {
1098 switch(block_size)
1099 {
1100 case 3:
1101 _func = &harris_score3x3_S32_S32_FLOAT;
1102 break;
1103 case 5:
1104 _func = &harris_score5x5_S32_S32_FLOAT;
1105 break;
1106 case 7:
1107 _func = &harris_score7x7_S32_S32_FLOAT;
1108 break;
1109 default:
1110 ARM_COMPUTE_ERROR("Invalid block size");
1111 break;
1112 }
1113 }
1114
1115 ARM_COMPUTE_ERROR_ON(nullptr == _func);
1116
1117 constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
1118 constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
1119 constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
1120 constexpr unsigned int num_rows_read_per_iteration = block_size;
1121
1122 // Configure kernel window
1123 Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
1124 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
1125
1126 update_window_and_padding(win,
1127 AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
1128 AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
1129 output_access);
1130
1131 ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
1132 input2->info()->valid_region());
1133
1134 output_access.set_valid_region(win, valid_region, border_undefined, border_size());
1135
1136 INEKernel::configure(win);
1137}