/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"

#include <arm_neon.h>
#include <cstdint>

using namespace arm_compute;

namespace arm_compute
{
class Coordinates;
} // namespace arm_compute

44namespace
45{
46const int32x4_t minusfour = vdupq_n_s32(-4);
47const int32x4_t minusfive = vdupq_n_s32(-5);
48const int32x4_t four = vdupq_n_s32(4);
49const int32x4_t five = vdupq_n_s32(5);
50const int32x4_t six = vdupq_n_s32(6);
51const int32x4_t fifteen = vdupq_n_s32(15);
52const int32x4_t twenty = vdupq_n_s32(20);
53
54inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
55{
56 int32x4x2_t out =
57 {
58 {
59 vnegq_s32(data.val[0]),
60 vnegq_s32(data.val[1])
61 }
62 };
63
64 out.val[0] = vmlaq_s32(out.val[0],
65 vextq_s32(data.val[0], data.val[1], 1), minusfour);
66
67 out.val[0] = vmlaq_s32(out.val[0],
68 vextq_s32(data.val[0], data.val[1], 2), minusfive);
69
70 out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);
71
72 out.val[0] = vmlaq_s32(out.val[0],
73 vextq_s32(data.val[1], data.val[2], 1), four);
74
75 out.val[0] = vaddq_s32(out.val[0],
76 vextq_s32(data.val[1], data.val[2], 2));
77
78 out.val[1] = vmlaq_s32(out.val[1],
79 vextq_s32(data.val[1], data.val[2], 1), minusfour);
80
81 out.val[1] = vmlaq_s32(out.val[1],
82 vextq_s32(data.val[1], data.val[2], 2), minusfive);
83
84 out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);
85
86 out.val[1] = vmlaq_s32(out.val[1],
87 vextq_s32(data.val[2], data.val[3], 1), four);
88
89 out.val[1] = vaddq_s32(out.val[1],
90 vextq_s32(data.val[2], data.val[3], 2));
91
92 return out;
93}
94
95inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
96{
97 int32x4x2_t out =
98 {
99 {
100 data.val[0],
101 data.val[1]
102 }
103 };
104
105 out.val[0] = vmlaq_s32(out.val[0],
106 vextq_s32(data.val[0], data.val[1], 1), six);
107
108 out.val[0] = vmlaq_s32(out.val[0],
109 vextq_s32(data.val[0], data.val[1], 2), fifteen);
110
111 out.val[0] = vmlaq_s32(out.val[0],
112 vextq_s32(data.val[0], data.val[1], 3), twenty);
113
114 out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);
115
116 out.val[0] = vmlaq_s32(out.val[0],
117 vextq_s32(data.val[1], data.val[2], 1), six);
118
119 out.val[0] = vaddq_s32(out.val[0],
120 vextq_s32(data.val[1], data.val[2], 2));
121
122 out.val[1] = vmlaq_s32(out.val[1],
123 vextq_s32(data.val[1], data.val[2], 1), six);
124
125 out.val[1] = vmlaq_s32(out.val[1],
126 vextq_s32(data.val[1], data.val[2], 2), fifteen);
127
128 out.val[1] = vmlaq_s32(out.val[1],
129 vextq_s32(data.val[1], data.val[2], 3), twenty);
130
131 out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);
132
133 out.val[1] = vmlaq_s32(out.val[1],
134 vextq_s32(data.val[2], data.val[3], 1), six);
135
136 out.val[1] = vaddq_s32(out.val[1],
137 vextq_s32(data.val[2], data.val[3], 2));
138
139 return out;
140}
141} // namespace
142
143NESobel7x7HorKernel::NESobel7x7HorKernel()
144 : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
145{
146}
147
148BorderSize NESobel7x7HorKernel::border_size() const
149{
150 return _border_size;
151}
152
153void NESobel7x7HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
154{
155 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8);
156 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
157
158 _run_sobel_x = output_x != nullptr;
159 _run_sobel_y = output_y != nullptr;
160
161 if(_run_sobel_x)
162 {
163 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
164 }
165
166 if(_run_sobel_y)
167 {
168 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
169 }
170
171 _input = input;
172 _output_x = output_x;
173 _output_y = output_y;
174 _border_size = BorderSize(border_undefined ? 0 : 3, 3);
175
176 // Configure kernel window
177 constexpr unsigned int num_elems_processed_per_iteration = 8;
178 constexpr unsigned int num_elems_read_per_iteration = 16;
179 constexpr unsigned int num_elems_written_per_iteration = 8;
180
181 Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
182 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
183 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
184
185 update_window_and_padding(win,
186 AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
187 output_x_access,
188 output_y_access);
189
190 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
191 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
192
193 INEKernel::configure(win);
194}
195
196void NESobel7x7HorKernel::run(const Window &window)
197{
198 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
199 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
200
201 Iterator input(_input, window);
202 Iterator output_x;
203 Iterator output_y;
204
205 if(_run_sobel_x)
206 {
207 output_x = Iterator(_output_x, window);
208 }
209
210 if(_run_sobel_y)
211 {
212 output_y = Iterator(_output_y, window);
213 }
214
215 if(_run_sobel_y && _run_sobel_x)
216 {
217 execute_window_loop(window, [&](const Coordinates & id)
218 {
219 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
220
221 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
222 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
223
224 const int32x4x4_t data_s32 =
225 {
226 {
227 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
228 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
229 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
230 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
231 }
232 };
233
234 const int32x4x2_t out_y = compute_hor_sobel_y(data_s32);
235 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out_y.val[0]);
236 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out_y.val[1]);
237
238 const int32x4x2_t out_x = compute_hor_sobel_x(data_s32);
239 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out_x.val[0]);
240 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out_x.val[1]);
241 },
242 input, output_x, output_y);
243 }
244 else if(_run_sobel_x)
245 {
246 execute_window_loop(window, [&](const Coordinates & id)
247 {
248 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
249
250 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
251 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
252
253 const int32x4x4_t data_s32 =
254 {
255 {
256 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
257 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
258 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
259 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
260 }
261 };
262
263 const int32x4x2_t out = compute_hor_sobel_x(data_s32);
264 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out.val[0]);
265 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
266 },
267 input, output_x);
268 }
269 else if(_run_sobel_y)
270 {
271 execute_window_loop(window, [&](const Coordinates & id)
272 {
273 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
274
275 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
276 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
277
278 const int32x4x4_t data_s32 =
279 {
280 {
281 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
282 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
283 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
284 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
285 }
286 };
287
288 const int32x4x2_t out = compute_hor_sobel_x(data_s32);
289 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out.val[0]);
290 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
291 },
292 input, output_y);
293 }
294}
295
296NESobel7x7VertKernel::NESobel7x7VertKernel()
297 : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
298{
299}
300
301BorderSize NESobel7x7VertKernel::border_size() const
302{
303 return BorderSize(3, 0);
304}
305
306void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
307{
308 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
309
310 _run_sobel_x = (output_x != nullptr);
311 _run_sobel_y = (output_y != nullptr);
312
313 if(_run_sobel_x)
314 {
315 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32);
316 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
317 }
318
319 if(_run_sobel_y)
320 {
321 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32);
322 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
323 }
324
325 _input_x = input_x;
326 _input_y = input_y;
327 _output_x = output_x;
328 _output_y = output_y;
329
330 const ITensor *const input = _run_sobel_x ? input_x : input_y;
331
332 // Configure kernel window
333 constexpr unsigned int num_elems_processed_per_iteration = 8;
334 constexpr unsigned int num_elems_read_per_iteration = 8;
335 constexpr unsigned int num_elems_written_per_iteration = 8;
336 constexpr unsigned int num_rows_read_per_iteration = 7;
337
338 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
339 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
340 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
341
342 update_window_and_padding(win,
343 AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
344 AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
345 output_x_access,
346 output_y_access);
347
348 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
349 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
350
351 INEKernel::configure(win);
352}
353
354void NESobel7x7VertKernel::run(const Window &window)
355{
356 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
357 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
358
359 Iterator input_x;
360 Iterator input_y;
361 Iterator output_x;
362 Iterator output_y;
363
364 int32_t in_x_stride = 0;
365 int32_t in_y_stride = 0;
366
367 if(_run_sobel_x)
368 {
369 input_x = Iterator(_input_x, window);
370 output_x = Iterator(_output_x, window);
371 in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format());
372 }
373
374 if(_run_sobel_y)
375 {
376 input_y = Iterator(_input_y, window);
377 output_y = Iterator(_output_y, window);
378 in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format());
379 }
380
381 if(_run_sobel_x)
382 {
383 execute_window_loop(window, [&](const Coordinates & id)
384 {
385 auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;
386
387 //top3
388 int32x4x2_t data =
389 {
390 {
391 vld1q_s32(in_ptr),
392 vld1q_s32(in_ptr + 4)
393 }
394 };
395
396 int32x4x2_t out = data;
397
398 //top2
399 in_ptr += in_x_stride;
400 data.val[0] = vld1q_s32(in_ptr);
401 out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
402
403 data.val[1] = vld1q_s32(in_ptr + 4);
404 out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
405
406 //top
407 in_ptr += in_x_stride;
408 data.val[0] = vld1q_s32(in_ptr);
409 out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
410
411 data.val[1] = vld1q_s32(in_ptr + 4);
412 out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
413
414 //mid
415 in_ptr += in_x_stride;
416 data.val[0] = vld1q_s32(in_ptr);
417 out.val[0] = vmlaq_s32(out.val[0], data.val[0], twenty);
418
419 data.val[1] = vld1q_s32(in_ptr + 4);
420 out.val[1] = vmlaq_s32(out.val[1], data.val[1], twenty);
421
422 //low
423 in_ptr += in_x_stride;
424 data.val[0] = vld1q_s32(in_ptr);
425 out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
426
427 data.val[1] = vld1q_s32(in_ptr + 4);
428 out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
429
430 //low2
431 in_ptr += in_x_stride;
432 data.val[0] = vld1q_s32(in_ptr);
433 out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
434
435 data.val[1] = vld1q_s32(in_ptr + 4);
436 out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
437
438 //low3
439 in_ptr += in_x_stride;
440 data.val[0] = vld1q_s32(in_ptr);
441 out.val[0] = vaddq_s32(out.val[0], data.val[0]);
442
443 data.val[1] = vld1q_s32(in_ptr + 4);
444 out.val[1] = vaddq_s32(out.val[1], data.val[1]);
445
446 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 0, out.val[0]);
447 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
448 },
449 input_x, output_x);
450 }
451
452 if(_run_sobel_y)
453 {
454 execute_window_loop(window, [&](const Coordinates & id)
455 {
456 auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;
457
458 //top3
459 int32x4x2_t data =
460 {
461 {
462 vld1q_s32(in_ptr),
463 vld1q_s32(in_ptr + 4)
464 }
465 };
466
467 int32x4x2_t out =
468 {
469 {
470 vnegq_s32(data.val[0]),
471 vnegq_s32(data.val[1])
472 }
473 };
474
475 //top2
476 in_ptr += in_y_stride;
477 data.val[0] = vld1q_s32(in_ptr);
478 out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfour);
479
480 data.val[1] = vld1q_s32(in_ptr + 4);
481 out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfour);
482
483 //top
484 in_ptr += in_y_stride;
485 data.val[0] = vld1q_s32(in_ptr);
486 out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfive);
487
488 data.val[1] = vld1q_s32(in_ptr + 4);
489 out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfive);
490
491 //low
492 in_ptr += (2 * in_y_stride);
493 data.val[0] = vld1q_s32(in_ptr);
494 out.val[0] = vmlaq_s32(out.val[0], data.val[0], five);
495
496 data.val[1] = vld1q_s32(in_ptr + 4);
497 out.val[1] = vmlaq_s32(out.val[1], data.val[1], five);
498
499 //low2
500 in_ptr += in_y_stride;
501 data.val[0] = vld1q_s32(in_ptr);
502 out.val[0] = vmlaq_s32(out.val[0], data.val[0], four);
503
504 data.val[1] = vld1q_s32(in_ptr + 4);
505 out.val[1] = vmlaq_s32(out.val[1], data.val[1], four);
506
507 //low3
508 in_ptr += in_y_stride;
509 data.val[0] = vld1q_s32(in_ptr);
510 out.val[0] = vaddq_s32(out.val[0], data.val[0]);
511
512 data.val[1] = vld1q_s32(in_ptr + 4);
513 out.val[1] = vaddq_s32(out.val[1], data.val[1]);
514
515 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 0, out.val[0]);
516 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
517 },
518 input_y, output_y);
519 }
520}