blob: 58c0acd4047fd049a7c534e73c262ebffa1985f2 [file] [log] [blame]
/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NENonLinearFilterKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Validate.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034
35#include <algorithm>
36#include <arm_neon.h>
37#include <array>
38#include <tuple>
39#include <utility>
40
41namespace arm_compute
42{
43namespace
44{
45const uint8x16_t zero_u8 = vdupq_n_u8(0);
46
47template <size_t columns>
48inline uint8x8_t min_row(uint8x16_t row_data)
49{
50 uint8x8_t min = vget_low_u8(row_data);
51
52 for(size_t c = 1; c < columns; ++c)
53 {
54 row_data = vextq_u8(row_data, zero_u8, 1);
55 min = vmin_u8(min, vget_low_u8(row_data));
56 }
57
58 return min;
59}
60
61template <size_t columns>
62inline uint8x8_t max_row(uint8x16_t row_data)
63{
64 uint8x8_t max = vget_low_u8(row_data);
65
66 for(size_t c = 1; c < columns; ++c)
67 {
68 row_data = vextq_u8(row_data, zero_u8, 1);
69 max = vmax_u8(max, vget_low_u8(row_data));
70 }
71
72 return max;
73}
74
75inline void sort(uint8x8_t &a, uint8x8_t &b)
76{
77 const uint8x8_t min = vmin_u8(a, b);
78 const uint8x8_t max = vmax_u8(a, b);
79 a = min;
80 b = max;
81}
82
83// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
84// Calculations that do not affect the median were removed.
85inline void sort5(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, uint8x8_t &p3, uint8x8_t &p4)
86{
87 sort(p0, p1);
88 sort(p2, p3);
89 sort(p0, p2);
90 sort(p1, p3);
91 sort(p1, p2);
92 sort(p0, p4);
93 sort(p1, p4);
94 sort(p2, p4);
95}
96
97inline void sort9(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2,
98 uint8x8_t &p3, uint8x8_t &p4, uint8x8_t &p5,
99 uint8x8_t &p6, uint8x8_t &p7, uint8x8_t &p8)
100{
101 sort(p1, p2);
102 sort(p4, p5);
103 sort(p7, p8);
104 sort(p0, p1);
105 sort(p3, p4);
106 sort(p6, p7);
107 sort(p1, p2);
108 sort(p4, p5);
109 sort(p7, p8);
110 sort(p0, p3);
111 sort(p5, p8);
112 sort(p4, p7);
113 sort(p3, p6);
114 sort(p1, p4);
115 sort(p2, p5);
116 sort(p4, p7);
117 sort(p4, p2);
118 sort(p6, p4);
119 sort(p4, p2);
120}
121
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100122inline void sort21(std::array<uint8x8_t, 21> &p)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100123{
124 sort(p[0], p[1]);
125 sort(p[2], p[3]);
126 sort(p[4], p[5]);
127 sort(p[6], p[7]);
128 sort(p[8], p[9]);
129 sort(p[10], p[11]);
130 sort(p[12], p[13]);
131 sort(p[14], p[15]);
132 sort(p[16], p[17]);
133 sort(p[18], p[19]);
134 sort(p[0], p[2]);
135 sort(p[1], p[3]);
136 sort(p[4], p[6]);
137 sort(p[5], p[7]);
138 sort(p[8], p[10]);
139 sort(p[9], p[11]);
140 sort(p[12], p[14]);
141 sort(p[13], p[15]);
142 sort(p[16], p[18]);
143 sort(p[17], p[19]);
144 sort(p[1], p[2]);
145 sort(p[5], p[6]);
146 sort(p[0], p[4]);
147 sort(p[3], p[7]);
148 sort(p[9], p[10]);
149 sort(p[13], p[14]);
150 sort(p[8], p[12]);
151 sort(p[11], p[15]);
152 sort(p[17], p[18]);
153 sort(p[16], p[20]);
154 sort(p[1], p[5]);
155 sort(p[2], p[6]);
156 sort(p[9], p[13]);
157 sort(p[10], p[14]);
158 sort(p[0], p[8]);
159 sort(p[7], p[15]);
160 sort(p[17], p[20]);
161 sort(p[1], p[4]);
162 sort(p[3], p[6]);
163 sort(p[9], p[12]);
164 sort(p[11], p[14]);
165 sort(p[18], p[20]);
166 sort(p[0], p[16]);
167 sort(p[2], p[4]);
168 sort(p[3], p[5]);
169 sort(p[10], p[12]);
170 sort(p[11], p[13]);
171 sort(p[1], p[9]);
172 sort(p[6], p[14]);
173 sort(p[19], p[20]);
174 sort(p[3], p[4]);
175 sort(p[11], p[12]);
176 sort(p[1], p[8]);
177 sort(p[2], p[10]);
178 sort(p[5], p[13]);
179 sort(p[7], p[14]);
180 sort(p[3], p[11]);
181 sort(p[2], p[8]);
182 sort(p[4], p[12]);
183 sort(p[7], p[13]);
184 sort(p[1], p[17]);
185 sort(p[3], p[10]);
186 sort(p[5], p[12]);
187 sort(p[1], p[16]);
188 sort(p[2], p[18]);
189 sort(p[3], p[9]);
190 sort(p[6], p[12]);
191 sort(p[2], p[16]);
192 sort(p[3], p[8]);
193 sort(p[7], p[12]);
194 sort(p[5], p[9]);
195 sort(p[6], p[10]);
196 sort(p[4], p[8]);
197 sort(p[7], p[11]);
198 sort(p[3], p[19]);
199 sort(p[5], p[8]);
200 sort(p[7], p[10]);
201 sort(p[3], p[18]);
202 sort(p[4], p[20]);
203 sort(p[6], p[8]);
204 sort(p[7], p[9]);
205 sort(p[3], p[17]);
206 sort(p[5], p[20]);
207 sort(p[7], p[8]);
208 sort(p[3], p[16]);
209 sort(p[6], p[20]);
210 sort(p[5], p[17]);
211 sort(p[7], p[20]);
212 sort(p[4], p[16]);
213 sort(p[6], p[18]);
214 sort(p[5], p[16]);
215 sort(p[7], p[19]);
216 sort(p[7], p[18]);
217 sort(p[6], p[16]);
218 sort(p[7], p[17]);
219 sort(p[10], p[18]);
220 sort(p[7], p[16]);
221 sort(p[9], p[17]);
222 sort(p[8], p[16]);
223 sort(p[9], p[16]);
224 sort(p[10], p[16]);
225}
226
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100227inline void sort25(std::array<uint8x8_t, 25> &p)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100228{
229 sort(p[1], p[2]);
230 sort(p[0], p[1]);
231 sort(p[1], p[2]);
232 sort(p[4], p[5]);
233 sort(p[3], p[4]);
234 sort(p[4], p[5]);
235 sort(p[0], p[3]);
236 sort(p[2], p[5]);
237 sort(p[2], p[3]);
238 sort(p[1], p[4]);
239 sort(p[1], p[2]);
240 sort(p[3], p[4]);
241 sort(p[7], p[8]);
242 sort(p[6], p[7]);
243 sort(p[7], p[8]);
244 sort(p[10], p[11]);
245 sort(p[9], p[10]);
246 sort(p[10], p[11]);
247 sort(p[6], p[9]);
248 sort(p[8], p[11]);
249 sort(p[8], p[9]);
250 sort(p[7], p[10]);
251 sort(p[7], p[8]);
252 sort(p[9], p[10]);
253 sort(p[0], p[6]);
254 sort(p[4], p[10]);
255 sort(p[4], p[6]);
256 sort(p[2], p[8]);
257 sort(p[2], p[4]);
258 sort(p[6], p[8]);
259 sort(p[1], p[7]);
260 sort(p[5], p[11]);
261 sort(p[5], p[7]);
262 sort(p[3], p[9]);
263 sort(p[3], p[5]);
264 sort(p[7], p[9]);
265 sort(p[1], p[2]);
266 sort(p[3], p[4]);
267 sort(p[5], p[6]);
268 sort(p[7], p[8]);
269 sort(p[9], p[10]);
270 sort(p[13], p[14]);
271 sort(p[12], p[13]);
272 sort(p[13], p[14]);
273 sort(p[16], p[17]);
274 sort(p[15], p[16]);
275 sort(p[16], p[17]);
276 sort(p[12], p[15]);
277 sort(p[14], p[17]);
278 sort(p[14], p[15]);
279 sort(p[13], p[16]);
280 sort(p[13], p[14]);
281 sort(p[15], p[16]);
282 sort(p[19], p[20]);
283 sort(p[18], p[19]);
284 sort(p[19], p[20]);
285 sort(p[21], p[22]);
286 sort(p[23], p[24]);
287 sort(p[21], p[23]);
288 sort(p[22], p[24]);
289 sort(p[22], p[23]);
290 sort(p[18], p[21]);
291 sort(p[20], p[23]);
292 sort(p[20], p[21]);
293 sort(p[19], p[22]);
294 sort(p[22], p[24]);
295 sort(p[19], p[20]);
296 sort(p[21], p[22]);
297 sort(p[23], p[24]);
298 sort(p[12], p[18]);
299 sort(p[16], p[22]);
300 sort(p[16], p[18]);
301 sort(p[14], p[20]);
302 sort(p[20], p[24]);
303 sort(p[14], p[16]);
304 sort(p[18], p[20]);
305 sort(p[22], p[24]);
306 sort(p[13], p[19]);
307 sort(p[17], p[23]);
308 sort(p[17], p[19]);
309 sort(p[15], p[21]);
310 sort(p[15], p[17]);
311 sort(p[19], p[21]);
312 sort(p[13], p[14]);
313 sort(p[15], p[16]);
314 sort(p[17], p[18]);
315 sort(p[19], p[20]);
316 sort(p[21], p[22]);
317 sort(p[23], p[24]);
318 sort(p[0], p[12]);
319 sort(p[8], p[20]);
320 sort(p[8], p[12]);
321 sort(p[4], p[16]);
322 sort(p[16], p[24]);
323 sort(p[12], p[16]);
324 sort(p[2], p[14]);
325 sort(p[10], p[22]);
326 sort(p[10], p[14]);
327 sort(p[6], p[18]);
328 sort(p[6], p[10]);
329 sort(p[10], p[12]);
330 sort(p[1], p[13]);
331 sort(p[9], p[21]);
332 sort(p[9], p[13]);
333 sort(p[5], p[17]);
334 sort(p[13], p[17]);
335 sort(p[3], p[15]);
336 sort(p[11], p[23]);
337 sort(p[11], p[15]);
338 sort(p[7], p[19]);
339 sort(p[7], p[11]);
340 sort(p[11], p[13]);
341 sort(p[11], p[12]);
342}
343} // namespace
344
345NENonLinearFilterKernel::NENonLinearFilterKernel()
346 : _border_width(0), _input(nullptr), _output(nullptr), _mask(nullptr), _pattern(MatrixPattern::BOX), _function(NonLinearFilterFunction::MIN), _func_idx(0), _border_size()
347{
348}
349
350BorderSize NENonLinearFilterKernel::border_size() const
351{
352 return _border_size;
353}
354
355void NENonLinearFilterKernel::configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
356 bool border_undefined)
357{
358 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
359 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
360 ARM_COMPUTE_ERROR_ON(3 != mask_size && 5 != mask_size);
361 ARM_COMPUTE_ERROR_ON(MatrixPattern::OTHER == pattern && nullptr == mask);
362
363 // Set class variables
364 _border_size = BorderSize(mask_size / 2);
365 _input = input;
366 _output = output;
367 _mask = mask;
368 _pattern = pattern;
369 _function = function;
370
371 // Configure kernel window
372 const unsigned int num_elems_processed_per_iteration = (MatrixPattern::OTHER == pattern) ? 1 : 8;
373 constexpr unsigned int num_elems_read_per_iteration = 16;
374
375 Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size());
376 AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
377 update_window_and_padding(win,
378 AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, mask_size),
379 output_access);
380 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
381
382 INEKernel::configure(win);
383
384 // Define function index
385 _func_idx = (3 == mask_size) ? 0 : 1;
386
387 if(MatrixPattern::OTHER != pattern)
388 {
389 _func_idx = (_func_idx) * 3 + static_cast<unsigned int>(function);
390 }
391}
392
393void NENonLinearFilterKernel::fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern)
394{
395 unsigned int v = 0;
396
397 for(int r = 0; r < rows; ++r)
398 {
399 for(int c = 0; c < cols; ++c, ++v)
400 {
401 uint8_t val = 0;
402
403 switch(pattern)
404 {
405 case MatrixPattern::BOX:
406 val = 255;
407 break;
408 case MatrixPattern::CROSS:
409 val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0;
410 break;
411 case MatrixPattern::DISK:
412 val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) + ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) *
413 (cols / 2.0f))) <= 1.0f ? 255 : 0;
414 break;
415 default:
416 return;
417 }
418
419 mask[v] = val;
420 }
421 }
422}
423
424template <>
425void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win)
426{
427 Iterator input(_input, win);
428 Iterator output(_output, win);
429
430 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -1)));
431 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
432 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 1)));
433
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100434 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100435 {
436 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
437 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
438 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
439
440 uint8x8_t p0 = vget_low_u8(top_data);
441 uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
442 uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
443 uint8x8_t p3 = vget_low_u8(mid_data);
444 uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
445 uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
446 uint8x8_t p6 = vget_low_u8(bot_data);
447 uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
448 uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
449
450 sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
451
452 vst1_u8(output.ptr(), p4);
453 },
454 input, output);
455}
456template <>
457void NENonLinearFilterKernel::median_filter_box<5, 5>(const Window &win)
458{
459 Iterator input(_input, win);
460 Iterator output(_output, win);
461
462 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
463 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
464 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
465 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
466 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
467
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100468 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100469 {
470 const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
471 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
472 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
473 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
474 const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
475
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100476 const std::array<uint8x8_t, 10> d =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100477 {
478 vget_low_u8(top2_data),
479 vget_high_u8(top2_data),
480 vget_low_u8(top_data),
481 vget_high_u8(top_data),
482 vget_low_u8(mid_data),
483 vget_high_u8(mid_data),
484 vget_low_u8(bot_data),
485 vget_high_u8(bot_data),
486 vget_low_u8(bot2_data),
487 vget_high_u8(bot2_data)
488 };
489
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100490 std::array<uint8x8_t, 25> p{ 0 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100491 for(unsigned int i = 0; i < 5; ++i)
492 {
493 const unsigned int idx_d = i * 2;
494 const unsigned int idx_p = i * 5;
495
496 p[idx_p] = d[idx_d];
497 p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
498 p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
499 p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
500 p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
501 }
502
503 sort25(p);
504
505 vst1_u8(output.ptr(), p[12]);
506 },
507 input, output);
508}
509
510template <int mask_w, int mask_h>
511void NENonLinearFilterKernel::min_filter_box(const Window &win)
512{
513 static_assert(mask_w > 0, "Mask size must not be 0");
514 static_assert(mask_h > 0, "Mask size must not be 0");
515
516 Iterator input(_input, win);
517 Iterator output(_output, win);
518
519 const int k_row_half = mask_h / 2;
520 const int k_col_half = mask_w / 2;
521
522 // Set row pointers
523 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
524 for(int i = -k_row_half; i <= k_row_half; ++i)
525 {
526 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
527 }
528
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100529 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100530 {
531 // Get min of rows
532 uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset());
533
534 for(unsigned int r = 1; r < mask_h; ++r)
535 {
536 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
537 rows_min = vminq_u8(rows_min, data);
538 }
539
540 const uint8x8_t out = min_row<mask_w>(rows_min);
541
542 // Store result as U8
543 vst1_u8(output.ptr(), out);
544 },
545 input, output);
546}
547
548template <int mask_w, int mask_h>
549void NENonLinearFilterKernel::max_filter_box(const Window &win)
550{
551 static_assert(mask_w > 0, "Mask size must not be 0");
552 static_assert(mask_h > 0, "Mask size must not be 0");
553 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
554
555 Iterator input(_input, win);
556 Iterator output(_output, win);
557
558 const int k_row_half = mask_h / 2;
559 const int k_col_half = mask_w / 2;
560
561 // Set row pointers
562 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
563 for(int i = -k_row_half; i <= k_row_half; ++i)
564 {
565 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
566 }
567
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100568 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100569 {
570 uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
571
572 // Get max of rows
573 for(unsigned int r = 1; r < mask_h; ++r)
574 {
575 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
576 rows_max = vmaxq_u8(rows_max, data);
577 }
578
579 // Get max of columns
580 const uint8x8_t out = max_row<mask_w>(rows_max);
581
582 // Store result as U8
583 vst1_u8(output.ptr(), out);
584 },
585 input, output);
586}
587
588template <>
589void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win)
590{
591 Iterator input(_input, win);
592 Iterator output(_output, win);
593
594 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
595 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
596 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
597
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100598 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100599 {
600 const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
601 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
602 const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
603
604 uint8x8_t p0 = top_data;
605 uint8x8_t p1 = vget_low_u8(mid_data);
606 uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
607 uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
608 uint8x8_t p4 = bot_data;
609
610 sort5(p0, p1, p2, p3, p4);
611
612 vst1_u8(output.ptr(), p2);
613 },
614 input, output);
615}
616
617template <>
618void NENonLinearFilterKernel::median_filter_cross<5, 5>(const Window &win)
619{
620 Iterator input(_input, win);
621 Iterator output(_output, win);
622
623 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -2)));
624 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
625 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
626 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
627 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 2)));
628
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100629 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100630 {
631 const uint8x8_t top2_data = vld1_u8(input_top2_ptr + input.offset());
632 const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
633 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
634 const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
635 const uint8x8_t bot2_data = vld1_u8(input_bot2_ptr + input.offset());
636
637 uint8x8_t p0 = top2_data;
638 uint8x8_t p1 = top_data;
639 uint8x8_t p2 = vget_low_u8(mid_data);
640 uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
641 uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
642 uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3);
643 uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4);
644 uint8x8_t p7 = bot_data;
645 uint8x8_t p8 = bot2_data;
646
647 sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
648
649 vst1_u8(output.ptr(), p4);
650 },
651 input, output);
652}
653
654template <int mask_w, int mask_h>
655void NENonLinearFilterKernel::min_filter_cross(const Window &win)
656{
657 static_assert(mask_w > 0, "Mask size must not be 0");
658 static_assert(mask_h > 0, "Mask size must not be 0");
659 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
660
661 Iterator input(_input, win);
662 Iterator output(_output, win);
663
664 const int k_row_half = mask_h / 2;
665 const int k_col_half = mask_w / 2;
666
667 const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
668
669 // Set row pointers
670 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
671 for(int i = -k_row_half; i <= k_row_half; ++i)
672 {
673 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
674 }
675
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100676 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100677 {
678 uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
679
680 // Get min of rows
681 for(unsigned int r = 1; r < mask_h; ++r)
682 {
683 const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
684 rows_min = vmin_u8(rows_min, data);
685 }
686
687 // Get min of middle row
688 const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
689 uint8x8_t out = min_row<mask_w>(data);
690
691 // Get final min
692 out = vmin_u8(out, rows_min);
693
694 // Store result as U8
695 vst1_u8(output.ptr(), out);
696 },
697 input, output);
698}
699
700template <int mask_w, int mask_h>
701void NENonLinearFilterKernel::max_filter_cross(const Window &win)
702{
703 static_assert(mask_w > 0, "Mask size must not be 0");
704 static_assert(mask_h > 0, "Mask size must not be 0");
705 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
706
707 Iterator input(_input, win);
708 Iterator output(_output, win);
709
710 const int k_row_half = mask_h / 2;
711 const int k_col_half = mask_w / 2;
712
713 const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
714
715 // Set row pointers
716 std::array<unsigned char *, mask_h> input_ptrs{ {} };
717 for(int i = -k_row_half; i <= k_row_half; ++i)
718 {
719 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
720 }
721
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100722 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100723 {
724 uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
725
726 // Get max of rows
727 for(unsigned int r = 1; r < mask_h; ++r)
728 {
729 const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
730 rows_max = vmax_u8(rows_max, data);
731 }
732
733 // Get max of middle row
734 const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
735 uint8x8_t out = max_row<mask_w>(data);
736
737 // Get final max
738 out = vmax_u8(out, rows_max);
739
740 // Store result as U8
741 vst1_u8(output.ptr(), out);
742 },
743 input, output);
744}
745
746template <>
747void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win)
748{
749 Iterator input(_input, win);
750 Iterator output(_output, win);
751
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100752 static const uint8x16_t zero = vdupq_n_u8(0);
753 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
754 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
755 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
756 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
757 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100758
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100759 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100760 {
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100761 const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100762 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
763 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
764 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100765 const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100766
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100767 std::array<uint8x8_t, 10> d =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100768 {
769 vget_low_u8(top2_data),
770 vget_high_u8(top2_data),
771 vget_low_u8(top_data),
772 vget_high_u8(top_data),
773 vget_low_u8(mid_data),
774 vget_high_u8(mid_data),
775 vget_low_u8(bot_data),
776 vget_high_u8(bot_data),
777 vget_low_u8(bot2_data),
778 vget_high_u8(bot2_data)
779 };
780
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100781 std::array<uint8x8_t, 21> p{ 0 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100782 p[0] = d[0];
783 p[1] = vext_u8(d[0], d[1], 1);
784 p[2] = vext_u8(d[0], d[1], 2);
785 p[18] = d[8];
786 p[19] = vext_u8(d[8], d[9], 1);
787 p[20] = vext_u8(d[8], d[9], 2);
788
789 for(unsigned int i = 0; i < 3; ++i)
790 {
791 const unsigned int idx_d = 2 + i * 2;
792 const unsigned int idx_p = 3 + i * 5;
793
794 p[idx_p] = d[idx_d];
795 p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
796 p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
797 p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
798 p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
799 }
800
801 sort21(p);
802
803 vst1_u8(output.ptr(), p[10]);
804 },
805 input, output);
806}
807
808template <>
809void NENonLinearFilterKernel::min_filter_disk<5, 5>(const Window &win)
810{
811 Iterator input(_input, win);
812 Iterator output(_output, win);
813
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100814 static const uint8x16_t zero = vdupq_n_u8(0);
815 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
816 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
817 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
818 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
819 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100820
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100821 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100822 {
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100823 const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100824 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
825 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
826 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100827 const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100828
829 const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data);
830 uint8x16_t rows_min_5 = vminq_u8(top_data, bot_data);
831 rows_min_5 = vminq_u8(rows_min_5, mid_data);
832
833 const uint8x8_t out_3 = min_row<3>(rows_min_3);
834 const uint8x8_t out_5 = min_row<5>(rows_min_5);
835
836 vst1_u8(output.ptr(), vmin_u8(out_3, out_5));
837 },
838 input, output);
839}
840
841template <>
842void NENonLinearFilterKernel::max_filter_disk<5, 5>(const Window &win)
843{
844 Iterator input(_input, win);
845 Iterator output(_output, win);
846
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100847 static const uint8x16_t zero = vdupq_n_u8(0);
848 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
849 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
850 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
851 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
852 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100853
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100854 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100855 {
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100856 const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100857 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
858 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
859 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
Georgios Pinitas0a7a8d12017-10-23 12:23:10 +0100860 const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100861
862 const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data);
863 uint8x16_t rows_max_5 = vmaxq_u8(top_data, bot_data);
864 rows_max_5 = vmaxq_u8(rows_max_5, mid_data);
865
866 const uint8x8_t out_3 = max_row<3>(rows_max_3);
867 const uint8x8_t out_5 = max_row<5>(rows_max_5);
868
869 vst1_u8(output.ptr(), vmax_u8(out_3, out_5));
870 },
871 input, output);
872}
873
874template <int mask_w, int mask_h>
875void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win)
876{
877 Iterator input(_input, win);
878 Iterator output(_output, win);
879 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
880
881 const int k_row_half = mask_h / 2;
882 const int k_col_half = mask_w / 2;
883 constexpr int mask_size = mask_w * mask_h;
884
885 // Set row pointers
886 std::array<unsigned char *, mask_h> input_ptrs{ {} };
887 for(int i = -k_row_half; i <= k_row_half; ++i)
888 {
889 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
890 }
891
Georgios Pinitas67d94d22018-10-09 18:48:37 +0100892 std::array<uint8_t, mask_size> vals{ {} };
893
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100894 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100895 {
Georgios Pinitas67d94d22018-10-09 18:48:37 +0100896 // Clear array
897 std::fill(std::begin(vals), std::end(vals), 0);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100898
899 size_t v = 0;
900 size_t m = 0;
901
902 for(unsigned int r = 0; r < mask_h; ++r)
903 {
904 const auto in_ptr = static_cast<const uint8_t *>(input_ptrs[r] + input.offset());
905
906 for(unsigned int c = 0; c < mask_w; ++c, ++m)
907 {
908 if(_mask[m] == 255)
909 {
910 vals[v] = in_ptr[c];
911 ++v;
912 }
913 }
914 }
915
916 // Only do something if there is at least one non-zero element in the
917 // mask
918 if(v > 0)
919 {
920 std::sort(vals.begin(), vals.begin() + v);
921
922 switch(_function)
923 {
924 case NonLinearFilterFunction::MIN:
925 *output.ptr() = vals[0];
926 break;
927 case NonLinearFilterFunction::MAX:
928 *output.ptr() = vals[v - 1];
929 break;
930 case NonLinearFilterFunction::MEDIAN:
931 *output.ptr() = vals[v / 2];
932 break;
933 default:
934 break;
935 }
936 }
937 },
938 input, output);
939}
940
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100941void NENonLinearFilterKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100942{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100943 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100944 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
945 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
946
947 using NonLinearFilterFunction = void (NENonLinearFilterKernel::*)(const Window & window);
948
949 // Function table for BOX pattern
950 static const std::array<NonLinearFilterFunction, 6> func_table_box =
951 {
952 {
953 &NENonLinearFilterKernel::median_filter_box<3, 3>,
954 &NENonLinearFilterKernel::min_filter_box<3, 3>,
955 &NENonLinearFilterKernel::max_filter_box<3, 3>,
956 &NENonLinearFilterKernel::median_filter_box<5, 5>,
957 &NENonLinearFilterKernel::min_filter_box<5, 5>,
958 &NENonLinearFilterKernel::max_filter_box<5, 5>,
959 }
960 };
961
962 // Function table for CROSS pattern
963 static const std::array<NonLinearFilterFunction, 6> func_table_cross =
964 {
965 {
966 &NENonLinearFilterKernel::median_filter_cross<3, 3>,
967 &NENonLinearFilterKernel::min_filter_cross<3, 3>,
968 &NENonLinearFilterKernel::max_filter_cross<3, 3>,
969 &NENonLinearFilterKernel::median_filter_cross<5, 5>,
970 &NENonLinearFilterKernel::min_filter_cross<5, 5>,
971 &NENonLinearFilterKernel::max_filter_cross<5, 5>,
972 }
973 };
974
975 // Function table for DISK pattern
976 static const std::array<NonLinearFilterFunction, 6> func_table_disk =
977 {
978 {
979 &NENonLinearFilterKernel::median_filter_box<3, 3>,
980 &NENonLinearFilterKernel::min_filter_box<3, 3>,
981 &NENonLinearFilterKernel::max_filter_box<3, 3>,
982 &NENonLinearFilterKernel::median_filter_disk<5, 5>,
983 &NENonLinearFilterKernel::min_filter_disk<5, 5>,
984 &NENonLinearFilterKernel::max_filter_disk<5, 5>,
985 }
986 };
987
988 // Function table for OTHER pattern
989 static const std::array<NonLinearFilterFunction, 2> func_table_generic =
990 {
991 {
992 &NENonLinearFilterKernel::non_linear_filter_generic<3, 3>,
993 &NENonLinearFilterKernel::non_linear_filter_generic<5, 5>,
994 }
995 };
996
997 switch(_pattern)
998 {
999 case MatrixPattern::BOX:
1000 ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_box.size());
1001 (this->*func_table_box[_func_idx])(window);
1002 break;
1003 case MatrixPattern::CROSS:
1004 ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_cross.size());
1005 (this->*func_table_cross[_func_idx])(window);
1006 break;
1007 case MatrixPattern::DISK:
1008 ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_disk.size());
1009 (this->*func_table_disk[_func_idx])(window);
1010 break;
1011 case MatrixPattern::OTHER:
1012 default:
1013 ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_generic.size());
1014 (this->*func_table_generic[_func_idx])(window);
1015 break;
1016 }
1017}
1018} // namespace arm_compute