blob: 233b2baabe690c3c5dd98d85e929df6ccfcf7b40 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2016, 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
25
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Types.h"
31#include "arm_compute/core/Utils.h"
32#include "arm_compute/core/Validate.h"
33#include "arm_compute/core/Window.h"
34
35#include <algorithm>
36#include <arm_neon.h>
37#include <cmath>
38#include <cstddef>
39
40using namespace arm_compute;
41
42#ifdef ARM_COMPUTE_ENABLE_FP16
43
// Explicit instantiations of the FP16 Harris score kernel for block sizes 3, 5 and 7.
44template class arm_compute::NEHarrisScoreFP16Kernel<3>;
45template class arm_compute::NEHarrisScoreFP16Kernel<5>;
46template class arm_compute::NEHarrisScoreFP16Kernel<7>;
47
48namespace fp16
49{
/** Evaluate the Harris response for eight lanes in half precision.
 *
 * Computes det(A) - sensitivity * trace(A)^2 with det(A) = gx2 * gy2 - gxgy^2 and
 * trace(A) = gx2 + gy2. Lanes whose response is not strictly greater than
 * @p strength_thresh are replaced by zero.
 *
 * @param[in] gx2             Accumulated Gx * Gx.
 * @param[in] gy2             Accumulated Gy * Gy.
 * @param[in] gxgy            Accumulated Gx * Gy.
 * @param[in] sensitivity     Scalar weight applied to trace(A)^2.
 * @param[in] strength_thresh Scalar threshold the response has to exceed.
 *
 * @return The thresholded responses.
 */
inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
{
    static const float16x8_t zero = vdupq_n_f16(0.f);

    // trace(A)^2
    const float16x8_t trace    = vaddq_f16(gx2, gy2);
    const float16x8_t trace_sq = vmulq_f16(trace, trace);

    // det(A) = gx2 * gy2 - gxgy * gxgy
    const float16x8_t det = vfmsq_f16(vmulq_f16(gx2, gy2), gxgy, gxgy);

    // det(A) - sensitivity * trace(A)^2
    const float16x8_t response = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace_sq);

    // Keep only the lanes with response > strength_thresh
    const uint16x8_t above_threshold = vcgtq_f16(response, vdupq_n_f16(strength_thresh));

    return vbslq_f16(above_threshold, response, zero);
}
70
/** Add one row's contribution to the FP16 Gx^2, Gy^2 and Gx*Gy sums of eight adjacent pixels.
 *
 * The low/high vectors together hold 16 consecutive gradient values of one row. They are
 * scaled by @p norm_factor, then 8-lane windows shifted by 0, 1, 2 (and 3, 4 when
 * block_size > 3, and 5, 6 when block_size == 7) elements are extracted and their
 * products are accumulated with fused multiply-adds.
 *
 * @param[in]      low_gx, low_gy   First eight Gx / Gy values of the row.
 * @param[in]      high_gx, high_gy Next eight Gx / Gy values of the row.
 * @param[in, out] gx2              Running sum of Gx * Gx.
 * @param[in, out] gy2              Running sum of Gy * Gy.
 * @param[in, out] gxgy             Running sum of Gx * Gy.
 * @param[in]      norm_factor      Scale applied to the gradients before accumulation.
 */
template <size_t block_size>
inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
                                              float norm_factor)
{
    const float16x8_t scale = vdupq_n_f16(norm_factor);

    // Normalized gradients
    const float16x8_t gx_lo = vmulq_f16(low_gx, scale);
    const float16x8_t gy_lo = vmulq_f16(low_gy, scale);
    const float16x8_t gx_hi = vmulq_f16(high_gx, scale);
    const float16x8_t gy_hi = vmulq_f16(high_gy, scale);

    // Adds the products of one shifted window to the three running sums
    const auto accumulate = [&gx2, &gy2, &gxgy](float16x8_t gx, float16x8_t gy)
    {
        gx2  = vfmaq_f16(gx2, gx, gx);
        gy2  = vfmaq_f16(gy2, gy, gy);
        gxgy = vfmaq_f16(gxgy, gx, gy);
    };

    // The shift of vextq_f16 must be an immediate, hence the explicit unrolling
    accumulate(vextq_f16(gx_lo, gx_hi, 0), vextq_f16(gy_lo, gy_hi, 0));
    accumulate(vextq_f16(gx_lo, gx_hi, 1), vextq_f16(gy_lo, gy_hi, 1));
    accumulate(vextq_f16(gx_lo, gx_hi, 2), vextq_f16(gy_lo, gy_hi, 2));

    if(block_size > 3)
    {
        accumulate(vextq_f16(gx_lo, gx_hi, 3), vextq_f16(gy_lo, gy_hi, 3));
        accumulate(vextq_f16(gx_lo, gx_hi, 4), vextq_f16(gy_lo, gy_hi, 4));
    }

    if(block_size == 7)
    {
        accumulate(vextq_f16(gx_lo, gx_hi, 5), vextq_f16(gy_lo, gy_hi, 5));
        accumulate(vextq_f16(gx_lo, gx_hi, 6), vextq_f16(gy_lo, gy_hi, 6));
    }
}
138
/** Compute eight Harris scores from S16 gradients using FP16 arithmetic.
 *
 * Starting block_size / 2 rows above and block_size / 2 elements to the left of the given
 * input pointers, block_size rows of 16 gradient values are read from each input,
 * accumulated with harris_score1xN_FLOAT_FLOAT_FLOAT() and turned into scores by
 * harris_score(). The eight resulting scores are widened to F32 and stored at @p out_ptr.
 *
 * @param[in]  in1_ptr         Pointer to the current Gx element (int16_t).
 * @param[in]  in2_ptr         Pointer to the current Gy element (int16_t).
 * @param[out] out_ptr         Pointer to the eight output scores (float).
 * @param[in]  in_stride       Distance between two input rows, in elements.
 * @param[in]  norm_factor     Normalization applied to the gradients.
 * @param[in]  sensitivity     Sensitivity factor of the Harris response.
 * @param[in]  strength_thresh Minimum response; weaker responses are stored as zero.
 */
template <size_t block_size>
inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
                                       float strength_thresh)
{
    // Offset from the current element to the top-left element of its block
    const auto top_left_offset = (block_size / 2) * (in_stride + 1);

    const int16_t *gx_row = static_cast<const int16_t *>(in1_ptr) - top_left_offset;
    const int16_t *gy_row = static_cast<const int16_t *>(in2_ptr) - top_left_offset;
    float *const   scores = static_cast<float *>(out_ptr);

    // Gx^2, Gy^2 and Gx*Gy
    float16x8_t gx2  = vdupq_n_f16(0.0f);
    float16x8_t gy2  = vdupq_n_f16(0.0f);
    float16x8_t gxgy = vdupq_n_f16(0.0f);

    for(size_t row = 0; row < block_size; ++row, gx_row += in_stride, gy_row += in_stride)
    {
        // 16 consecutive gradients of the current row, converted to FP16
        const float16x8_t gx_lo = vcvtq_f16_s16(vld1q_s16(gx_row));
        const float16x8_t gx_hi = vcvtq_f16_s16(vld1q_s16(gx_row + 8));
        const float16x8_t gy_lo = vcvtq_f16_s16(vld1q_s16(gy_row));
        const float16x8_t gy_hi = vcvtq_f16_s16(vld1q_s16(gy_row + 8));

        harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(gx_lo, gy_lo, gx_hi, gy_hi, gx2, gy2, gxgy, norm_factor);
    }

    // Calculate harris score
    const float16x8_t response = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);

    // Store score as F32
    vst1q_f32(scores + 0, vcvt_f32_f16(vget_low_f16(response)));
    vst1q_f32(scores + 4, vcvt_f32_f16(vget_high_f16(response)));
}
176
/** Compute eight Harris scores from S32 gradients using FP16 arithmetic (generic block size).
 *
 * Starting block_size / 2 rows above and block_size / 2 elements to the left of the given
 * input pointers, block_size rows of 12 gradient values are read from each input and
 * converted S32 -> F32 -> F16. The upper four lanes of the "high" vectors passed to
 * harris_score1xN_FLOAT_FLOAT_FLOAT() are zero. The eight resulting scores are widened to
 * F32 and stored at @p out_ptr.
 *
 * @param[in]  in1_ptr         Pointer to the current Gx element (int32_t).
 * @param[in]  in2_ptr         Pointer to the current Gy element (int32_t).
 * @param[out] out_ptr         Pointer to the eight output scores (float).
 * @param[in]  in_stride       Distance between two input rows, in elements.
 * @param[in]  norm_factor     Normalization applied to the gradients.
 * @param[in]  sensitivity     Sensitivity factor of the Harris response.
 * @param[in]  strength_thresh Minimum response; weaker responses are stored as zero.
 */
template <size_t block_size>
inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
                                       float strength_thresh)
{
    static const float16x8_t zero = vdupq_n_f16(0.0f);

    // Offset from the current element to the top-left element of its block
    const auto top_left_offset = (block_size / 2) * (in_stride + 1);

    const int32_t *gx_row = static_cast<const int32_t *>(in1_ptr) - top_left_offset;
    const int32_t *gy_row = static_cast<const int32_t *>(in2_ptr) - top_left_offset;
    float *const   scores = static_cast<float *>(out_ptr);

    // Loads four S32 values and narrows them to FP16
    const auto load_f16x4 = [](const int32_t *src)
    {
        return vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(src)));
    };

    // Gx^2, Gy^2 and Gx*Gy
    float16x8_t gx2  = zero;
    float16x8_t gy2  = zero;
    float16x8_t gxgy = zero;

    for(size_t row = 0; row < block_size; ++row, gx_row += in_stride, gy_row += in_stride)
    {
        const float16x8_t gx_lo = vcombine_f16(load_f16x4(gx_row), load_f16x4(gx_row + 4));
        const float16x8_t gx_hi = vcombine_f16(load_f16x4(gx_row + 8), vget_low_f16(zero));
        const float16x8_t gy_lo = vcombine_f16(load_f16x4(gy_row), load_f16x4(gy_row + 4));
        const float16x8_t gy_hi = vcombine_f16(load_f16x4(gy_row + 8), vget_low_f16(zero));

        harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(gx_lo, gy_lo, gx_hi, gy_hi, gx2, gy2, gxgy, norm_factor);
    }

    // Calculate harris score
    const float16x8_t response = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);

    // Store score as F32
    vst1q_f32(scores + 0, vcvt_f32_f16(vget_low_f16(response)));
    vst1q_f32(scores + 4, vcvt_f32_f16(vget_high_f16(response)));
}
224
/** Compute eight Harris scores from S32 gradients using FP16 arithmetic (7x7 block).
 *
 * A 7-wide window over eight outputs needs 14 consecutive columns, so this specialization
 * reads 16 gradient values per row (instead of the 12 read by the generic version),
 * starting three rows above and three elements to the left of the given input pointers.
 * Values are converted S32 -> F32 -> F16, accumulated with
 * harris_score1xN_FLOAT_FLOAT_FLOAT<7>() and turned into scores by harris_score(). The
 * eight scores are widened to F32 and stored at @p out_ptr.
 *
 * All four 4-element loads of a row are addressed relative to a single row pointer that is
 * advanced once per iteration. Previously the pointers to columns 12..15 were never
 * advanced, so every row reused the first row's values for those columns.
 *
 * @param[in]  in1_ptr         Pointer to the current Gx element (int32_t).
 * @param[in]  in2_ptr         Pointer to the current Gy element (int32_t).
 * @param[out] out_ptr         Pointer to the eight output scores (float).
 * @param[in]  in_stride       Distance between two input rows, in elements.
 * @param[in]  norm_factor     Normalization applied to the gradients.
 * @param[in]  sensitivity     Sensitivity factor of the Harris response.
 * @param[in]  strength_thresh Minimum response; weaker responses are stored as zero.
 */
template <>
inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
                                          float strength_thresh)
{
    // Top-left element of the 7x7 block of the first output pixel
    const int32_t *gx_row = static_cast<const int32_t *>(in1_ptr) - 3 * (in_stride + 1);
    const int32_t *gy_row = static_cast<const int32_t *>(in2_ptr) - 3 * (in_stride + 1);
    float *const   output = static_cast<float *>(out_ptr);

    // Loads eight consecutive S32 values and narrows them to FP16
    const auto load_f16x8 = [](const int32_t *src)
    {
        return vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(src))),
                            vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(src + 4))));
    };

    // Gx^2, Gy^2 and Gx*Gy
    float16x8_t gx2  = vdupq_n_f16(0.0f);
    float16x8_t gy2  = vdupq_n_f16(0.0f);
    float16x8_t gxgy = vdupq_n_f16(0.0f);

    for(size_t i = 0; i < 7; ++i)
    {
        const float16x8_t low_gx  = load_f16x8(gx_row);
        const float16x8_t high_gx = load_f16x8(gx_row + 8);
        const float16x8_t low_gy  = load_f16x8(gy_row);
        const float16x8_t high_gy = load_f16x8(gy_row + 8);
        harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);

        // Move to the next row; every load above follows because it is relative to these pointers
        gx_row += in_stride;
        gy_row += in_stride;
    }

    // Calculate harris score
    const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);

    // Store score
    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
}
274
275} // namespace fp16
276
277template <int32_t block_size>
278BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
279{
280 return _border_size;
281}
282
283template <int32_t block_size>
284NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
285 : INEHarrisScoreKernel(), _func(nullptr)
286{
287}
288
289template <int32_t block_size>
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100290void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100291{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100292 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100293 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
294 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
295 ARM_COMPUTE_ERROR_ON(_func == nullptr);
296
297 Iterator input1(_input1, window);
298 Iterator input2(_input2, window);
299 Iterator output(_output, window);
300
301 const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
302
303 execute_window_loop(window, [&](const Coordinates & id)
304 {
305 (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
306 },
307 input1, input2, output);
308}
309
310template <int32_t block_size>
311void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
312 bool border_undefined)
313{
314 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
315 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
316 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
317 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
318 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
319 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
320 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
321 ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
322
323 _input1 = input1;
324 _input2 = input2;
325 _output = output;
326 _sensitivity = sensitivity;
327 _strength_thresh = strength_thresh;
328 _norm_factor = norm_factor;
329 _border_size = BorderSize(block_size / 2);
330
331 if(input1->info()->data_type() == DataType::S16)
332 {
333 _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
334 }
335 else
336 {
337 _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
338 }
339
340 ARM_COMPUTE_ERROR_ON(nullptr == _func);
341
342 constexpr unsigned int num_elems_processed_per_iteration = 8;
343 constexpr unsigned int num_elems_read_per_iteration = 16;
344 constexpr unsigned int num_elems_written_per_iteration = 8;
345 constexpr unsigned int num_rows_read_per_iteration = block_size;
346
347 // Configure kernel window
348 Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
349 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
350
351 update_window_and_padding(win,
352 AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
353 AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
354 output_access);
355
356 ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
357 input2->info()->valid_region());
358
359 output_access.set_valid_region(win, valid_region, border_undefined, border_size());
360
361 INEKernel::configure(win);
362}
363
Anthony Barbierac69aa12017-07-03 17:39:37 +0100364#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100365
// Explicit instantiations of the Harris score kernel class and of its default
// constructor for block sizes 3, 5 and 7.
366template class arm_compute::NEHarrisScoreKernel<3>;
367template class arm_compute::NEHarrisScoreKernel<5>;
368template class arm_compute::NEHarrisScoreKernel<7>;
369template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
370template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
371template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
372
373namespace
374{
/** Evaluate the Harris response for four lanes in single precision.
 *
 * Computes det(A) - sensitivity * trace(A)^2 with det(A) = gx2 * gy2 - gxgy^2 and
 * trace(A) = gx2 + gy2. Lanes whose response is not strictly greater than
 * @p strength_thresh are replaced by zero.
 *
 * @param[in] gx2             Accumulated Gx * Gx.
 * @param[in] gy2             Accumulated Gy * Gy.
 * @param[in] gxgy            Accumulated Gx * Gy.
 * @param[in] sensitivity     Weight applied to trace(A)^2, one value per lane.
 * @param[in] strength_thresh Threshold the response has to exceed, one value per lane.
 *
 * @return The thresholded responses.
 */
inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
{
    // trace(A)^2
    const float32x4_t trace    = vaddq_f32(gx2, gy2);
    const float32x4_t trace_sq = vmulq_f32(trace, trace);

    // det(A) = gx2 * gy2 - gxgy * gxgy
    const float32x4_t det = vmlsq_f32(vmulq_f32(gx2, gy2), gxgy, gxgy);

    // det(A) - sensitivity * trace(A)^2
    const float32x4_t response = vmlsq_f32(det, sensitivity, trace_sq);

    // Keep only the lanes with response > strength_thresh
    const uint32x4_t above_threshold = vcgtq_f32(response, strength_thresh);

    return vbslq_f32(above_threshold, response, vdupq_n_f32(0.0f));
}
393
/** Add one row's contribution to the Gx^2, Gy^2 and Gx*Gy sums of four pixels (3-wide window).
 *
 * The low/high vectors hold eight consecutive gradients of one row. After scaling by
 * @p norm_factor, the 4-lane windows at element offsets 0, 1 and 2 are accumulated.
 *
 * @param[in]      low_gx, low_gy   First four Gx / Gy values of the row.
 * @param[in]      high_gx, high_gy Next four Gx / Gy values of the row.
 * @param[in, out] gx2              Running sum of Gx * Gx.
 * @param[in, out] gy2              Running sum of Gy * Gy.
 * @param[in, out] gxgy             Running sum of Gx * Gy.
 * @param[in]      norm_factor      Per-lane scale applied to the gradients.
 */
inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                              float32x4_t norm_factor)
{
    // Normalized gradients
    const float32x4_t gx_lo = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_lo = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_hi = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_hi = vmulq_f32(high_gy, norm_factor);

    // Adds the products of one window position to the three running sums
    const auto accumulate = [&gx2, &gy2, &gxgy](float32x4_t gx, float32x4_t gy)
    {
        gx2  = vmlaq_f32(gx2, gx, gx);
        gy2  = vmlaq_f32(gy2, gy, gy);
        gxgy = vmlaq_f32(gxgy, gx, gy);
    };

    // Left, middle and right column of the window
    accumulate(gx_lo, gy_lo);
    accumulate(vextq_f32(gx_lo, gx_hi, 1), vextq_f32(gy_lo, gy_hi, 1));
    accumulate(vextq_f32(gx_lo, gx_hi, 2), vextq_f32(gy_lo, gy_hi, 2));
}
425
/** Add one row's contribution to the Gx^2, Gy^2 and Gx*Gy sums of four pixels (5-wide window).
 *
 * The low/high vectors hold eight consecutive gradients of one row. After scaling by
 * @p norm_factor, the 4-lane windows at element offsets 0, 1, 2, 3 and 4 are accumulated
 * (offset 4 is the high vector itself).
 *
 * @param[in]      low_gx, low_gy   First four Gx / Gy values of the row.
 * @param[in]      high_gx, high_gy Next four Gx / Gy values of the row.
 * @param[in, out] gx2              Running sum of Gx * Gx.
 * @param[in, out] gy2              Running sum of Gy * Gy.
 * @param[in, out] gxgy             Running sum of Gx * Gy.
 * @param[in]      norm_factor      Per-lane scale applied to the gradients.
 */
inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                              float32x4_t norm_factor)
{
    // Normalized gradients
    const float32x4_t gx_lo = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_lo = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_hi = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_hi = vmulq_f32(high_gy, norm_factor);

    // Adds the products of one window position to the three running sums
    const auto accumulate = [&gx2, &gy2, &gxgy](float32x4_t gx, float32x4_t gy)
    {
        gx2  = vmlaq_f32(gx2, gx, gx);
        gy2  = vmlaq_f32(gy2, gy, gy);
        gxgy = vmlaq_f32(gxgy, gx, gy);
    };

    // L2, L1, M, R1 and R2 columns of the window, in that order
    accumulate(gx_lo, gy_lo);
    accumulate(vextq_f32(gx_lo, gx_hi, 1), vextq_f32(gy_lo, gy_hi, 1));
    accumulate(vextq_f32(gx_lo, gx_hi, 2), vextq_f32(gy_lo, gy_hi, 2));
    accumulate(vextq_f32(gx_lo, gx_hi, 3), vextq_f32(gy_lo, gy_hi, 3));
    accumulate(gx_hi, gy_hi);
}
480
/** Add one row's contribution to the Gx^2, Gy^2 and Gx*Gy sums of four pixels (7-wide window).
 *
 * low, high and high1 hold twelve consecutive gradients of one row. After scaling by
 * @p norm_factor, the 4-lane windows at element offsets 0 to 6 are accumulated: offsets
 * 0..3 come from (low, high), offset 4 is high itself and offsets 5 and 6 come from
 * (high, high1).
 *
 * @param[in]      low_gx, low_gy     First four Gx / Gy values of the row.
 * @param[in]      high_gx, high_gy   Next four Gx / Gy values of the row.
 * @param[in]      high_gx1, high_gy1 Last four Gx / Gy values of the row.
 * @param[in, out] gx2                Running sum of Gx * Gx.
 * @param[in, out] gy2                Running sum of Gy * Gy.
 * @param[in, out] gxgy               Running sum of Gx * Gy.
 * @param[in]      norm_factor        Per-lane scale applied to the gradients.
 */
inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
                                              float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
{
    // Normalized gradients
    const float32x4_t gx_a = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_a = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_b = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_b = vmulq_f32(high_gy, norm_factor);
    const float32x4_t gx_c = vmulq_f32(high_gx1, norm_factor);
    const float32x4_t gy_c = vmulq_f32(high_gy1, norm_factor);

    // Adds the products of one window position to the three running sums
    const auto accumulate = [&gx2, &gy2, &gxgy](float32x4_t gx, float32x4_t gy)
    {
        gx2  = vmlaq_f32(gx2, gx, gx);
        gy2  = vmlaq_f32(gy2, gy, gy);
        gxgy = vmlaq_f32(gxgy, gx, gy);
    };

    // L3, L2, L1 and M columns
    accumulate(gx_a, gy_a);
    accumulate(vextq_f32(gx_a, gx_b, 1), vextq_f32(gy_a, gy_b, 1));
    accumulate(vextq_f32(gx_a, gx_b, 2), vextq_f32(gy_a, gy_b, 2));
    accumulate(vextq_f32(gx_a, gx_b, 3), vextq_f32(gy_a, gy_b, 3));

    // R1 column
    accumulate(gx_b, gy_b);

    // R2 and R3 columns
    accumulate(vextq_f32(gx_b, gx_c, 1), vextq_f32(gy_b, gy_c, 1));
    accumulate(vextq_f32(gx_b, gx_c, 2), vextq_f32(gy_b, gy_c, 2));
}
563
/** Compute eight 3x3 Harris scores from S16 gradients.
 *
 * For each of the three rows (previous, current, next) two overlapping 8-element loads
 * cover columns -1..10 relative to the input pointers. The values are widened to F32 and
 * accumulated with harris_score1x3_FLOAT_FLOAT_FLOAT(), separately for outputs 0..3 and
 * 4..7. The eight scores produced by harris_score() are stored at @p output_ptr.
 *
 * @param[in]  input1_ptr         Pointer to the current Gx element (int16_t).
 * @param[in]  input2_ptr         Pointer to the current Gy element (int16_t).
 * @param[out] output_ptr         Pointer to the eight output scores (float).
 * @param[in]  input_stride       Distance between two input rows, in elements.
 * @param[in]  in_norm_factor     Normalization applied to the gradients.
 * @param[in]  in_sensitivity     Sensitivity factor of the Harris response.
 * @param[in]  in_strength_thresh Minimum response; weaker responses are stored as zero.
 */
inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Leftmost column of the window in the current row
    const int16_t *const gx_left = static_cast<const int16_t *>(input1_ptr) - 1;
    const int16_t *const gy_left = static_cast<const int16_t *>(input2_ptr) - 1;
    float *const         scores  = static_cast<float *>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Widen one half of an S16 vector to F32
    const auto low_as_f32 = [](int16x8_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(vget_low_s16(v)));
    };
    const auto high_as_f32 = [](int16x8_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(vget_high_s16(v)));
    };

    // Gx^2, Gy^2 and Gx*Gy for outputs 0..3 (suffix _0) and 4..7 (suffix _1)
    float32x4_t gx2_0  = vdupq_n_f32(0.0f);
    float32x4_t gx2_1  = vdupq_n_f32(0.0f);
    float32x4_t gy2_0  = vdupq_n_f32(0.0f);
    float32x4_t gy2_1  = vdupq_n_f32(0.0f);
    float32x4_t gxgy_0 = vdupq_n_f32(0.0f);
    float32x4_t gxgy_1 = vdupq_n_f32(0.0f);

    // Rows -1, 0 and +1 relative to the current row
    for(int32_t row = -1; row <= 1; ++row)
    {
        const int32_t row_offset = row * input_stride;

        const int16x8_t gx_first  = vld1q_s16(gx_left + row_offset);
        const int16x8_t gx_second = vld1q_s16(gx_left + row_offset + 4);
        const int16x8_t gy_first  = vld1q_s16(gy_left + row_offset);
        const int16x8_t gy_second = vld1q_s16(gy_left + row_offset + 4);

        harris_score1x3_FLOAT_FLOAT_FLOAT(low_as_f32(gx_first), low_as_f32(gy_first), high_as_f32(gx_first), high_as_f32(gy_first),
                                          gx2_0, gy2_0, gxgy_0, norm_factor);
        harris_score1x3_FLOAT_FLOAT_FLOAT(low_as_f32(gx_second), low_as_f32(gy_second), high_as_f32(gx_second), high_as_f32(gy_second),
                                          gx2_1, gy2_1, gxgy_1, norm_factor);
    }

    // Calculate harris score and store it
    vst1q_f32(scores + 0, harris_score(gx2_0, gy2_0, gxgy_0, sensitivity, strength_thresh));
    vst1q_f32(scores + 4, harris_score(gx2_1, gy2_1, gxgy_1, sensitivity, strength_thresh));
}
677
/** Compute eight 3x3 Harris scores from S32 gradients.
 *
 * For each of the three rows (previous, current, next) twelve values covering columns
 * -1..10 relative to the input pointers are loaded, converted to F32 and accumulated with
 * harris_score1x3_FLOAT_FLOAT_FLOAT(), separately for outputs 0..3 and 4..7. The eight
 * scores produced by harris_score() are stored at @p output_ptr.
 *
 * @param[in]  input1_ptr         Pointer to the current Gx element (int32_t).
 * @param[in]  input2_ptr         Pointer to the current Gy element (int32_t).
 * @param[out] output_ptr         Pointer to the eight output scores (float).
 * @param[in]  input_stride       Distance between two input rows, in elements.
 * @param[in]  in_norm_factor     Normalization applied to the gradients.
 * @param[in]  in_sensitivity     Sensitivity factor of the Harris response.
 * @param[in]  in_strength_thresh Minimum response; weaker responses are stored as zero.
 */
inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Leftmost column of the window in the current row
    const int32_t *const gx_left = static_cast<const int32_t *>(input1_ptr) - 1;
    const int32_t *const gy_left = static_cast<const int32_t *>(input2_ptr) - 1;
    float *const         scores  = static_cast<float *>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Load four S32 values as F32
    const auto load_f32 = [](const int32_t *src)
    {
        return vcvtq_f32_s32(vld1q_s32(src));
    };

    // Gx^2, Gy^2 and Gx*Gy for outputs 0..3 (suffix _0) and 4..7 (suffix _1)
    float32x4_t gx2_0  = vdupq_n_f32(0.0f);
    float32x4_t gx2_1  = vdupq_n_f32(0.0f);
    float32x4_t gy2_0  = vdupq_n_f32(0.0f);
    float32x4_t gy2_1  = vdupq_n_f32(0.0f);
    float32x4_t gxgy_0 = vdupq_n_f32(0.0f);
    float32x4_t gxgy_1 = vdupq_n_f32(0.0f);

    // Rows -1, 0 and +1 relative to the current row
    for(int32_t row = -1; row <= 1; ++row)
    {
        const int32_t *const gx_row = gx_left + row * input_stride;
        const int32_t *const gy_row = gy_left + row * input_stride;

        const float32x4_t gx_a = load_f32(gx_row);
        const float32x4_t gy_a = load_f32(gy_row);
        const float32x4_t gx_b = load_f32(gx_row + 4);
        const float32x4_t gy_b = load_f32(gy_row + 4);
        const float32x4_t gx_c = load_f32(gx_row + 8);
        const float32x4_t gy_c = load_f32(gy_row + 8);

        harris_score1x3_FLOAT_FLOAT_FLOAT(gx_a, gy_a, gx_b, gy_b, gx2_0, gy2_0, gxgy_0, norm_factor);
        harris_score1x3_FLOAT_FLOAT_FLOAT(gx_b, gy_b, gx_c, gy_c, gx2_1, gy2_1, gxgy_1, norm_factor);
    }

    // Calculate harris score and store it
    vst1q_f32(scores + 0, harris_score(gx2_0, gy2_0, gxgy_0, sensitivity, strength_thresh));
    vst1q_f32(scores + 4, harris_score(gx2_1, gy2_1, gxgy_1, sensitivity, strength_thresh));
}
767
/** Compute eight 5x5 Harris scores from S16 gradients.
 *
 * Starting two rows above and two elements to the left of the input pointers, five rows
 * are processed. In each row two overlapping 8-element loads (at offsets 0 and 4) are
 * widened to F32 and accumulated with harris_score1x5_FLOAT_FLOAT_FLOAT(), separately for
 * outputs 0..3 and 4..7. The eight scores produced by harris_score() are stored at
 * @p output_ptr.
 *
 * @param[in]  input1_ptr         Pointer to the current Gx element (int16_t).
 * @param[in]  input2_ptr         Pointer to the current Gy element (int16_t).
 * @param[out] output_ptr         Pointer to the eight output scores (float).
 * @param[in]  input_stride       Distance between two input rows, in elements.
 * @param[in]  in_norm_factor     Normalization applied to the gradients.
 * @param[in]  in_sensitivity     Sensitivity factor of the Harris response.
 * @param[in]  in_strength_thresh Minimum response; weaker responses are stored as zero.
 */
inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Top-left element of the 5x5 window of the first output pixel
    const int16_t *gx_row = static_cast<const int16_t *>(input1_ptr) - 2 - 2 * input_stride;
    const int16_t *gy_row = static_cast<const int16_t *>(input2_ptr) - 2 - 2 * input_stride;
    float *const   scores = static_cast<float *>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Widen one half of an S16 vector to F32
    const auto low_as_f32 = [](int16x8_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(vget_low_s16(v)));
    };
    const auto high_as_f32 = [](int16x8_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(vget_high_s16(v)));
    };

    // Gx^2, Gy^2 and Gx*Gy for outputs 0..3 (suffix _0) and 4..7 (suffix _1)
    float32x4_t gx2_0  = vdupq_n_f32(0.0f);
    float32x4_t gx2_1  = vdupq_n_f32(0.0f);
    float32x4_t gy2_0  = vdupq_n_f32(0.0f);
    float32x4_t gy2_1  = vdupq_n_f32(0.0f);
    float32x4_t gxgy_0 = vdupq_n_f32(0.0f);
    float32x4_t gxgy_1 = vdupq_n_f32(0.0f);

    for(int row = 0; row < 5; ++row, gx_row += input_stride, gy_row += input_stride)
    {
        const int16x8_t gx_first  = vld1q_s16(gx_row);
        const int16x8_t gx_second = vld1q_s16(gx_row + 4);
        const int16x8_t gy_first  = vld1q_s16(gy_row);
        const int16x8_t gy_second = vld1q_s16(gy_row + 4);

        harris_score1x5_FLOAT_FLOAT_FLOAT(low_as_f32(gx_first), low_as_f32(gy_first), high_as_f32(gx_first), high_as_f32(gy_first),
                                          gx2_0, gy2_0, gxgy_0, norm_factor);
        harris_score1x5_FLOAT_FLOAT_FLOAT(low_as_f32(gx_second), low_as_f32(gy_second), high_as_f32(gx_second), high_as_f32(gy_second),
                                          gx2_1, gy2_1, gxgy_1, norm_factor);
    }

    // Calculate harris score and store it
    vst1q_f32(scores + 0, harris_score(gx2_0, gy2_0, gxgy_0, sensitivity, strength_thresh));
    vst1q_f32(scores + 4, harris_score(gx2_1, gy2_1, gxgy_1, sensitivity, strength_thresh));
}
852
/** Compute eight 5x5 Harris scores from S32 gradients.
 *
 * Starting two rows above and two elements to the left of the input pointers, five rows
 * are processed. In each row twelve values are loaded, converted to F32 and accumulated
 * with harris_score1x5_FLOAT_FLOAT_FLOAT(), separately for outputs 0..3 and 4..7. The
 * eight scores produced by harris_score() are stored at @p output_ptr.
 *
 * @param[in]  input1_ptr         Pointer to the current Gx element (int32_t).
 * @param[in]  input2_ptr         Pointer to the current Gy element (int32_t).
 * @param[out] output_ptr         Pointer to the eight output scores (float).
 * @param[in]  input_stride       Distance between two input rows, in elements.
 * @param[in]  in_norm_factor     Normalization applied to the gradients.
 * @param[in]  in_sensitivity     Sensitivity factor of the Harris response.
 * @param[in]  in_strength_thresh Minimum response; weaker responses are stored as zero.
 */
inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Top-left element of the 5x5 window of the first output pixel
    const int32_t *gx_row = static_cast<const int32_t *>(input1_ptr) - 2 - 2 * input_stride;
    const int32_t *gy_row = static_cast<const int32_t *>(input2_ptr) - 2 - 2 * input_stride;
    float *const   scores = static_cast<float *>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Load four S32 values as F32
    const auto load_f32 = [](const int32_t *src)
    {
        return vcvtq_f32_s32(vld1q_s32(src));
    };

    // Gx^2, Gy^2 and Gx*Gy for outputs 0..3 (suffix _0) and 4..7 (suffix _1)
    float32x4_t gx2_0  = vdupq_n_f32(0.0f);
    float32x4_t gx2_1  = vdupq_n_f32(0.0f);
    float32x4_t gy2_0  = vdupq_n_f32(0.0f);
    float32x4_t gy2_1  = vdupq_n_f32(0.0f);
    float32x4_t gxgy_0 = vdupq_n_f32(0.0f);
    float32x4_t gxgy_1 = vdupq_n_f32(0.0f);

    for(int row = 0; row < 5; ++row, gx_row += input_stride, gy_row += input_stride)
    {
        const float32x4_t gx_a = load_f32(gx_row);
        const float32x4_t gy_a = load_f32(gy_row);
        const float32x4_t gx_b = load_f32(gx_row + 4);
        const float32x4_t gy_b = load_f32(gy_row + 4);
        const float32x4_t gx_c = load_f32(gx_row + 8);
        const float32x4_t gy_c = load_f32(gy_row + 8);

        harris_score1x5_FLOAT_FLOAT_FLOAT(gx_a, gy_a, gx_b, gy_b, gx2_0, gy2_0, gxgy_0, norm_factor);
        harris_score1x5_FLOAT_FLOAT_FLOAT(gx_b, gy_b, gx_c, gy_c, gx2_1, gy2_1, gxgy_1, norm_factor);
    }

    // Calculate harris score and store it
    vst1q_f32(scores + 0, harris_score(gx2_0, gy2_0, gxgy_0, sensitivity, strength_thresh));
    vst1q_f32(scores + 4, harris_score(gx2_1, gy2_1, gxgy_1, sensitivity, strength_thresh));
}
927
928inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
929 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
930{
931 auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
932 auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
933 const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
934 const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
935 const auto output = static_cast<float *__restrict>(output_ptr);
936
937 // Gx^2, Gy^2 and Gx*Gy
938 float32x4_t gx2 = vdupq_n_f32(0.0f);
939 float32x4_t gy2 = vdupq_n_f32(0.0f);
940 float32x4_t gxgy = vdupq_n_f32(0.0f);
941 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
942 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
943 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
944
945 for(int i = 0; i < 7; ++i)
946 {
947 const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
948 const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
949 const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
950 const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
951
952 float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
953 float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
954 float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
955 float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
956 float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
957 float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
958 harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
959
960 // Update gx and gy pointer
961 gx_ptr_0 += input_stride;
962 gy_ptr_0 += input_stride;
963 gx_ptr_1 += input_stride;
964 gy_ptr_1 += input_stride;
965 }
966
967 // Calculate harris score
968 const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
969
970 // Store score
971 vst1q_f32(output, mc);
972}
973
974inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
975 float in_norm_factor, float in_sensitivity, float in_strength_thresh)
976{
977 auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
978 auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
979 const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
980 const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
981 const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
982 const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
983 const auto output = static_cast<float *__restrict>(output_ptr);
984
985 // Gx^2, Gy^2 and Gx*Gy
986 float32x4_t gx2 = vdupq_n_f32(0.0f);
987 float32x4_t gy2 = vdupq_n_f32(0.0f);
988 float32x4_t gxgy = vdupq_n_f32(0.0f);
989 float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
990 float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
991 float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
992
993 for(int i = 0; i < 7; ++i)
994 {
995 const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
996 const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
997 const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
998 const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
999 const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
1000 const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
1001 harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
1002
1003 // Update gx and gy pointer
1004 gx_ptr_0 += input_stride;
1005 gy_ptr_0 += input_stride;
1006 gx_ptr_1 += input_stride;
1007 gy_ptr_1 += input_stride;
1008 gx_ptr_2 += input_stride;
1009 gy_ptr_2 += input_stride;
1010 }
1011
1012 // Calculate harris score
1013 const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
1014
1015 // Store score
1016 vst1q_f32(output, mc);
1017}
1018
1019} // namespace
1020
1021INEHarrisScoreKernel::INEHarrisScoreKernel()
1022 : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
1023{
1024}
1025
1026template <int32_t block_size>
1027NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
1028 : INEHarrisScoreKernel(), _func(nullptr)
1029{
1030}
1031
1032template <int32_t block_size>
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001033void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001034{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001035 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001036 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1037 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1038 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1039
1040 Iterator input1(_input1, window);
1041 Iterator input2(_input2, window);
1042 Iterator output(_output, window);
1043
1044 const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
1045
1046 execute_window_loop(window, [&](const Coordinates & id)
1047 {
1048 (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
1049 },
1050 input1, input2, output);
1051}
1052
1053template <int32_t block_size>
1054BorderSize NEHarrisScoreKernel<block_size>::border_size() const
1055{
1056 return _border_size;
1057}
1058
1059template <int32_t block_size>
1060void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
1061 bool border_undefined)
1062{
1063 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
1064 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
1065 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
1066 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
1067 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
1068 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
1069 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
1070 ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
1071
1072 _input1 = input1;
1073 _input2 = input2;
1074 _output = output;
1075 _sensitivity = sensitivity;
1076 _strength_thresh = strength_thresh;
1077 _norm_factor = norm_factor;
1078 _border_size = BorderSize(block_size / 2);
1079
1080 if(input1->info()->data_type() == DataType::S16)
1081 {
1082 switch(block_size)
1083 {
1084 case 3:
1085 _func = &harris_score3x3_S16_S16_FLOAT;
1086 break;
1087 case 5:
1088 _func = &harris_score5x5_S16_S16_FLOAT;
1089 break;
1090 case 7:
1091 _func = &harris_score7x7_S16_S16_FLOAT;
1092 break;
1093 default:
1094 ARM_COMPUTE_ERROR("Invalid block size");
1095 break;
1096 }
1097 }
1098 else
1099 {
1100 switch(block_size)
1101 {
1102 case 3:
1103 _func = &harris_score3x3_S32_S32_FLOAT;
1104 break;
1105 case 5:
1106 _func = &harris_score5x5_S32_S32_FLOAT;
1107 break;
1108 case 7:
1109 _func = &harris_score7x7_S32_S32_FLOAT;
1110 break;
1111 default:
1112 ARM_COMPUTE_ERROR("Invalid block size");
1113 break;
1114 }
1115 }
1116
1117 ARM_COMPUTE_ERROR_ON(nullptr == _func);
1118
1119 constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
1120 constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
1121 constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
1122 constexpr unsigned int num_rows_read_per_iteration = block_size;
1123
1124 // Configure kernel window
1125 Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
1126 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
1127
1128 update_window_and_padding(win,
1129 AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
1130 AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
1131 output_access);
1132
1133 ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
1134 input2->info()->valid_region());
1135
1136 output_access.set_valid_region(win, valid_region, border_undefined, border_size());
1137
1138 INEKernel::configure(win);
1139}