blob: 919efd2ae2658375821ea31b19703d1c156a9534 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2016, 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
25
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/Validate.h"
30
31#include <algorithm>
32#include <arm_neon.h>
33#include <cstddef>
34#include <limits>
35
36using namespace arm_compute;
37
38NEFastCornersKernel::NEFastCornersKernel()
39 : INEKernel(), _input(nullptr), _output(nullptr), _threshold(0), _non_max_suppression(false)
40{
41}
42
43namespace
44{
45constexpr size_t PERMUTATIONS = 16;
46constexpr size_t PERM_SIZE = 16;
47
48inline uint8x8x2_t create_permutation_index(size_t k)
49{
50 ARM_COMPUTE_ERROR_ON(k >= PERMUTATIONS);
51
52 static const uint8_t permutations_table[PERMUTATIONS][PERM_SIZE]
53 {
54 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
55 { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
56 { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
57 { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
58 { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
59 { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
60 { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
61 { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
62 { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
63 { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
64 { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
65 { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
66 { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
67 { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
68 { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
69 { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
70
71 };
72
73 const uint8x8x2_t index =
74 {
75 {
76 vld1_u8(permutations_table[k]),
77 vld1_u8(permutations_table[k] + 8)
78 }
79 };
80
81 return index;
82}
83
84inline uint8x8x4_t create_circle_index_register()
85{
86 /*
87 This function creates the index registers to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
88
89 . . F 0 1 . . .
90 . E . . . 2 . .
91 D . . . . . 3 .
92 C . . P . . 4 .
93 B . . . . . 5 .
94 . A . . . 6 . .
95 . . 9 8 7 . . .
96
97 Where . is an irrelevant texel value
98
99 We want to retrieve all texels [0,F]
100
101 The 4 registers in r will then be used to get these texels out of two tables in the function get_circle_texels()
102
103 The first table holds the top 4 rows of texels
104 . . F 0 1 . . .
105 . E . . . 2 . .
106 D . . . . . 3 .
107 C . . P . . 4 .
108
109 The second table the bottom 3 rows of texels
110 B . . . . . 5 .
111 . A . . . 6 . .
112 . . 9 8 7 . . .
113
114 */
115 static const uint8_t top_right[8] =
116 {
117 /* The register r.val[0] will be used to retrieve these texels:
118 . . . 0 1 . . .
119 . . . . . 2 . .
120 . . . . . . 3 .
121 . . . . . . 4 .
122 */
123 3 /* top table, first row, elem 4, value 0 in the diagram above */,
124 4 /* top table, first row, elem 5, value 1 in the diagram above */,
125 13 /* top table, second row, elem 6, value 2 in the diagram above */,
126 22 /* top table, third row, elem 7, value 3 in the diagram above*/,
127 30 /* top table, fourth row, elem 7, value 4 in the diagram above*/,
128 255,
129 255,
130 255
131 };
132
133 static const uint8_t bottom_right[8] =
134 {
135 /* The register r.val[1] will be used to retrieve these texels:
136 . . . . . . 5 .
137 . . . . . 6 . .
138 . . . . 7 . . .
139 */
140 255,
141 255,
142 255,
143 255,
144 255,
145 6 /* low table, first row, elem 7, value 5 in the diagram above*/,
146 13 /* low table, second row, elem 6, value 6 in the diagram above*/,
147 20 /* low table, third row, elem 5, value 7 in the diagram above*/
148 };
149
150 static const uint8_t top_left[8] =
151 {
152 /* The register r.val[2] will be used to retrieve these texels:
153 . . F . . . . .
154 . E . . . . . .
155 D . . . . . . .
156 C . . . . . . .
157 */
158 255,
159 255,
160 255,
161 255,
162 24 /* top table, fourth row, elem 1, value C in the diagram above */,
163 16 /* top table, third row, elem 1, value D in the diagram above*/,
164 9 /* top table, second row, elem 2, value E in the diagram above*/,
165 2 /* top table, first row, elem 3, value F in the diagram above*/
166 };
167
168 static const uint8_t bottom_left[8] =
169 {
170 /* The register r.val[3] will be used to retrieve these texels:
171 B . . . . . . .
172 . A . . . . . .
173 . . 9 8 . . . .
174 */
175 19 /* low table, third row, elem 4, value 8 in the diagram above */,
176 18 /* low table, third row, elem 3, value 9 in the diagram above */,
177 9 /* low table, second row, elem 2, value A in the diagram above */,
178 0 /* low table, first row, elem 1, value B in the diagram above */,
179 255,
180 255,
181 255,
182 255
183 };
184
185 const uint8x8x4_t reg =
186 {
187 {
188 vld1_u8(top_right),
189 vld1_u8(bottom_right),
190 vld1_u8(top_left),
191 vld1_u8(bottom_left)
192 }
193 };
194
195 return reg;
196}
197
198inline uint8x16_t get_circle_texels(const uint8x8x4_t &index, const uint8x8x4_t &tbl_hi, const uint8x8x3_t &tbl_lo)
199{
200 /*
201 This function loads the 16 texels in the Bresenham circle of radius 3 into the register 'texels'.
202 The parameter 'index' is an array of indices which was previously setup in setup_circle_index_register().
203 tbl_hi and tbl_lo are the two tables holding the texels in the window [(-3,-3),(+3,+3)] for a given texel P
204 */
205 return vcombine_u8(vtbx3_u8(vtbl4_u8(tbl_hi, index.val[0]), tbl_lo, index.val[1]),
206 vtbx3_u8(vtbl4_u8(tbl_hi, index.val[2]), tbl_lo, index.val[3]));
207}
208
209inline uint8x16_t get_permutation_texels(const uint8x8x2_t &permutation_index, const uint8x8x2_t &tbl_circle)
210{
211 /*
212 This function stores the 9 texels of a give permutation X in the neon register 'texels'
213
214 'tbl_circle' is a LUT with the texels 0 to F
215
216 . . F 0 1 . . .
217 . E . . . 2 . .
218 D . . . . . 3 .
219 C . . P . . 4 .
220 B . . . . . 5 .
221 . A . . . 6 . .
222 . . 9 8 7 . . .
223
224 'permutation_index' is one of the permutations below:
225
226 { 0, 1, 2, 3, 4, 5, 6, 7, 8},
227 { F, 0, 1, 2, 3, 4, 5, 6, 7},
228 { E, F, 0, 1, 2, 3, 4, 5, 6},
229 { D, E, F, 0, 1, 2, 3, 4, 5},
230 { C, D, E, F, 0, 1, 2, 3, 4},
231 { B, C, D, E, F, 0, 1, 2, 3},
232 { A, B, C, D, E, F, 0, 1, 2},
233 { 9, A, B, C, D, E, F, 0, 1},
234 { 8, 9, A, B, C, D, E, F, 0},
235 { 7, 8, 9, A, B, C, D, E, F},
236 { 6, 7, 8, 9, A, B, C, D, E},
237 { 5, 6, 7, 8, 9, A, B, C, D},
238 { 4, 5, 6, 7, 8, 9, A, B, C},
239 { 3, 4, 5, 6, 7, 8, 9, A, B},
240 { 2, 3, 4, 5, 6, 7, 8, 9, A},
241 { 1, 2, 3, 4, 5, 6, 7, 8, 9},
242 */
243 static const uint8x8_t perm_right = vdup_n_u8(255); // init to 255 so that vtbx preserves the original values of the lanes
244
245 return vcombine_u8(vtbl2_u8(tbl_circle, permutation_index.val[0]),
246 vtbx2_u8(perm_right, tbl_circle, permutation_index.val[1]));
247}
248
249inline bool is_permutation_brighter(const uint8x16_t &permutation, const uint8x16_t &pg)
250{
251 const uint8x16_t res_gt = vcgtq_u8(permutation, pg);
252
253 return vget_lane_u64(vreinterpret_u64_u8(vand_u8(vget_high_u8(res_gt), vget_low_u8(res_gt))), 0) == std::numeric_limits<uint64_t>::max();
254}
255
256inline bool is_permutation_darker(const uint8x16_t &permutation, const uint8x16_t &pl)
257{
258 const uint8x16_t res_lt = vcltq_u8(permutation, pl);
259 const uint64x2_t u64res_lt = vreinterpretq_u64_u8(res_lt);
260 const uint64_t t3 = vgetq_lane_u64(u64res_lt, 0);
261 const uint64_t t4 = vgetq_lane_u64(u64res_lt, 1);
262
263 return std::numeric_limits<uint64_t>::max() == t3 && 255 == t4;
264}
265
266inline bool is_permutation_corner(const uint8x16_t &permutation, const uint8x16_t &pg, const uint8x16_t &pl)
267{
268 return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl);
269}
270
271inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, uint8x8x2_t perm_indices[PERMUTATIONS])
272{
273 /*
274 This function determines whether the point 'p' is a corner.
275 */
276 uint8x16_t pg = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
277 uint8x16_t pl = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
278
279 bool corner_detected = false;
280
281 for(size_t j = 0; !corner_detected && j < PERMUTATIONS; ++j)
282 {
283 const uint8x16_t pe_texels = get_permutation_texels(perm_indices[j], tbl_circle_texels);
284 corner_detected = is_permutation_corner(pe_texels, pg, pl);
285 }
286
287 return corner_detected;
288}
289
290inline uint8x8x2_t create_circle_tbl(const uint8_t *const __restrict buffer[7], size_t in_offset, const uint8x8x4_t &circle_index_r)
291{
292 /*
293 This function builds a LUT holding the 16 texels in the Brensenham circle radius 3.
294 circle_index_r is a vector of 4 registers to retrieve the texels from the two tables mentioned above.
295 */
296
297 //Load the texels in the window [(x-3,y-3),(x+3,y+3)].
298 //The top 4 rows are loaded in tbl_hi and the low 3 rows in tbl_lo.
299 //These two tables are then used to retrieve the texels in the Bresenham circle of radius 3.
300 const uint8x8x4_t tbl_window_hi =
301 {
302 {
303 vld1_u8(buffer[0] + in_offset),
304 vld1_u8(buffer[1] + in_offset),
305 vld1_u8(buffer[2] + in_offset),
306 vld1_u8(buffer[3] + in_offset)
307 }
308 };
309
310 const uint8x8x3_t tbl_window_lo =
311 {
312 {
313 vld1_u8(buffer[4] + in_offset),
314 vld1_u8(buffer[5] + in_offset),
315 vld1_u8(buffer[6] + in_offset)
316 }
317 };
318
319 const uint8x16_t circle_texels = get_circle_texels(circle_index_r, tbl_window_hi, tbl_window_lo);
320
321 const uint8x8x2_t tbl_circle_texels =
322 {
323 {
324 vget_low_u8(circle_texels),
325 vget_high_u8(circle_texels)
326 }
327 };
328
329 return tbl_circle_texels;
330}
331
332inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, uint8x8x2_t perm_indices[PERMUTATIONS])
333{
334 uint8_t b = 255;
335 uint8_t a = tolerance;
336
337 while(b - a > 1)
338 {
339 const uint16_t ab = a + b;
340 const uint8_t c = ab >> 1;
341
342 if(point_is_fast_corner(p, c, tbl_circle, perm_indices))
343 {
344 a = c;
345 }
346 else
347 {
348 b = c;
349 }
350 }
351
352 return a;
353}
354} // namespace
355
356BorderSize NEFastCornersKernel::border_size() const
357{
358 return BorderSize(3);
359}
360
361void NEFastCornersKernel::configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined)
362{
363 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
364 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
365 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
366 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
367 ARM_COMPUTE_ERROR_ON_MSG(border_undefined == false, "Not implemented");
368
369 _input = input;
370 _output = output;
371 _threshold = threshold;
372 _non_max_suppression = non_max_suppression;
373
374 constexpr unsigned int num_elems_processed_per_iteration = 1;
375 constexpr unsigned int num_elems_read_per_iteration = 8;
376 constexpr unsigned int num_elems_written_per_iteration = 1;
377 constexpr unsigned int num_rows_read_per_iteration = 7;
378
379 // Configure kernel window
380 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
381 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
382 AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
383
384 update_window_and_padding(win, input_access, output_access);
385
386 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
387
388 INEKernel::configure(win);
389}
390
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100391void NEFastCornersKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100392{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100393 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100394 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
395 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
396
397 std::array<uint8x8x2_t, PERMUTATIONS> perm_index{ {} };
398 /*
399 We use a LUT loaded with 7 rows of uint8_t from the input image [-3,-3]...[+3,+3] to retrieve the texels in the Brensenham circle radius 3 and put them in one neon register uint8x16_t.
400 The three lines below setup the neon index registers to get these texels out from the table
401 */
402 const uint8x8x4_t circle_index_r = create_circle_index_register();
403 /*
404 We put the 16 texels (circle) in a LUT to easily generate all the permutations. The for block below setups the indices for each permutation.
405 */
406 for(size_t k = 0; k < PERMUTATIONS; ++k)
407 {
408 perm_index[k] = create_permutation_index(k);
409 }
410
411 Iterator in(_input, window);
412 Iterator out(_output, window);
413
414 const uint8_t *const __restrict in_row[7] =
415 {
416 _input->ptr_to_element(Coordinates(-3, -3)),
417 _input->ptr_to_element(Coordinates(-3, -2)),
418 _input->ptr_to_element(Coordinates(-3, -1)),
419 _input->ptr_to_element(Coordinates(-3, 0)),
420 _input->ptr_to_element(Coordinates(-3, 1)),
421 _input->ptr_to_element(Coordinates(-3, 2)),
422 _input->ptr_to_element(Coordinates(-3, 3))
423 };
424
425 auto is_rejected = [](uint8_t p, uint8_t q, uint8_t a, uint8_t b)
426 {
427 const bool p_is_in_ab = (a <= p) && (p <= b);
428 const bool q_is_in_ab = (a <= q) && (q <= b);
429 return p_is_in_ab && q_is_in_ab;
430 };
431
432 execute_window_loop(window, [&](const Coordinates & id)
433 {
434 const size_t in_offset = in.offset();
435 const uint8_t p0 = *in.ptr();
436 const uint8_t b = std::min(p0 + _threshold, 255);
437 const uint8_t a = std::max(p0 - _threshold, 0);
438 uint8_t score = 0;
439 /*
440 Fast check to discard points which cannot be corners and avoid the expensive computation of the potential 16 permutations
441
442 pixels 1 and 9 are examined, if both I1 and I9 are within [Ip - t, Ip + t], then candidate p is not a corner.
443 */
444 const uint8_t p1 = (in_offset + in_row[0])[3];
445 const uint8_t p9 = (in_offset + in_row[6])[3];
446
447 if(!is_rejected(p1, p9, a, b))
448 {
449 /* pixels 5 and 13 are further examined to check whether three of them are brighter than Ip + t or darker than Ip - t */
450 const uint8_t p5 = (in_offset + in_row[3])[6];
451 const uint8_t p13 = (in_offset + in_row[3])[0];
452
453 if(!is_rejected(p5, p13, a, b))
454 {
455 /* at this stage we use the full test with the 16 permutations to classify the point as corner or not */
456 const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r);
457
458 if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index.data()))
459 {
460 if(_non_max_suppression)
461 {
462 score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index.data());
463 }
464 else
465 {
466 score = 1;
467 }
468 }
469 }
470 }
471
472 *out.ptr() = score;
473 },
474 in, out);
475}