blob: 40a3e31a3989f2d0c00b5d2c382cae27da1bd770 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2016, 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
25
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Types.h"
31#include "arm_compute/core/Utils.h"
32#include "arm_compute/core/Validate.h"
33
34#include <arm_neon.h>
35#include <cstdint>
36
37using namespace arm_compute;
38
39namespace arm_compute
40{
41class Coordinates;
42} // namespace arm_compute
43
44namespace
45{
46const int32x4_t minusfour = vdupq_n_s32(-4);
47const int32x4_t minusfive = vdupq_n_s32(-5);
48const int32x4_t four = vdupq_n_s32(4);
49const int32x4_t five = vdupq_n_s32(5);
50const int32x4_t six = vdupq_n_s32(6);
51const int32x4_t fifteen = vdupq_n_s32(15);
52const int32x4_t twenty = vdupq_n_s32(20);
53
54inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
55{
56 int32x4x2_t out =
57 {
58 {
59 vnegq_s32(data.val[0]),
60 vnegq_s32(data.val[1])
61 }
62 };
63
64 out.val[0] = vmlaq_s32(out.val[0],
65 vextq_s32(data.val[0], data.val[1], 1), minusfour);
66
67 out.val[0] = vmlaq_s32(out.val[0],
68 vextq_s32(data.val[0], data.val[1], 2), minusfive);
69
70 out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);
71
72 out.val[0] = vmlaq_s32(out.val[0],
73 vextq_s32(data.val[1], data.val[2], 1), four);
74
75 out.val[0] = vaddq_s32(out.val[0],
76 vextq_s32(data.val[1], data.val[2], 2));
77
78 out.val[1] = vmlaq_s32(out.val[1],
79 vextq_s32(data.val[1], data.val[2], 1), minusfour);
80
81 out.val[1] = vmlaq_s32(out.val[1],
82 vextq_s32(data.val[1], data.val[2], 2), minusfive);
83
84 out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);
85
86 out.val[1] = vmlaq_s32(out.val[1],
87 vextq_s32(data.val[2], data.val[3], 1), four);
88
89 out.val[1] = vaddq_s32(out.val[1],
90 vextq_s32(data.val[2], data.val[3], 2));
91
92 return out;
93}
94
95inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
96{
97 int32x4x2_t out =
98 {
99 {
100 data.val[0],
101 data.val[1]
102 }
103 };
104
105 out.val[0] = vmlaq_s32(out.val[0],
106 vextq_s32(data.val[0], data.val[1], 1), six);
107
108 out.val[0] = vmlaq_s32(out.val[0],
109 vextq_s32(data.val[0], data.val[1], 2), fifteen);
110
111 out.val[0] = vmlaq_s32(out.val[0],
112 vextq_s32(data.val[0], data.val[1], 3), twenty);
113
114 out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);
115
116 out.val[0] = vmlaq_s32(out.val[0],
117 vextq_s32(data.val[1], data.val[2], 1), six);
118
119 out.val[0] = vaddq_s32(out.val[0],
120 vextq_s32(data.val[1], data.val[2], 2));
121
122 out.val[1] = vmlaq_s32(out.val[1],
123 vextq_s32(data.val[1], data.val[2], 1), six);
124
125 out.val[1] = vmlaq_s32(out.val[1],
126 vextq_s32(data.val[1], data.val[2], 2), fifteen);
127
128 out.val[1] = vmlaq_s32(out.val[1],
129 vextq_s32(data.val[1], data.val[2], 3), twenty);
130
131 out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);
132
133 out.val[1] = vmlaq_s32(out.val[1],
134 vextq_s32(data.val[2], data.val[3], 1), six);
135
136 out.val[1] = vaddq_s32(out.val[1],
137 vextq_s32(data.val[2], data.val[3], 2));
138
139 return out;
140}
141} // namespace
142
143NESobel7x7HorKernel::NESobel7x7HorKernel()
144 : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
145{
146}
147
148BorderSize NESobel7x7HorKernel::border_size() const
149{
150 return _border_size;
151}
152
153void NESobel7x7HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
154{
155 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8);
156 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
157
158 _run_sobel_x = output_x != nullptr;
159 _run_sobel_y = output_y != nullptr;
160
161 if(_run_sobel_x)
162 {
163 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
164 }
165
166 if(_run_sobel_y)
167 {
168 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
169 }
170
171 _input = input;
172 _output_x = output_x;
173 _output_y = output_y;
174 _border_size = BorderSize(border_undefined ? 0 : 3, 3);
175
176 // Configure kernel window
177 constexpr unsigned int num_elems_processed_per_iteration = 8;
178 constexpr unsigned int num_elems_read_per_iteration = 16;
179 constexpr unsigned int num_elems_written_per_iteration = 8;
180
181 Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
182 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
183 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
184
185 update_window_and_padding(win,
186 AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
187 output_x_access,
188 output_y_access);
189
190 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
191 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
192
193 INEKernel::configure(win);
194}
195
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100196void NESobel7x7HorKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100197{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100198 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100199 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
200 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
201
202 Iterator input(_input, window);
203 Iterator output_x;
204 Iterator output_y;
205
206 if(_run_sobel_x)
207 {
208 output_x = Iterator(_output_x, window);
209 }
210
211 if(_run_sobel_y)
212 {
213 output_y = Iterator(_output_y, window);
214 }
215
216 if(_run_sobel_y && _run_sobel_x)
217 {
218 execute_window_loop(window, [&](const Coordinates & id)
219 {
220 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
221
222 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
223 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
224
225 const int32x4x4_t data_s32 =
226 {
227 {
228 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
229 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
230 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
231 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
232 }
233 };
234
235 const int32x4x2_t out_y = compute_hor_sobel_y(data_s32);
236 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out_y.val[0]);
237 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out_y.val[1]);
238
239 const int32x4x2_t out_x = compute_hor_sobel_x(data_s32);
240 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out_x.val[0]);
241 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out_x.val[1]);
242 },
243 input, output_x, output_y);
244 }
245 else if(_run_sobel_x)
246 {
247 execute_window_loop(window, [&](const Coordinates & id)
248 {
249 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
250
251 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
252 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
253
254 const int32x4x4_t data_s32 =
255 {
256 {
257 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
258 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
259 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
260 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
261 }
262 };
263
264 const int32x4x2_t out = compute_hor_sobel_x(data_s32);
265 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out.val[0]);
266 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
267 },
268 input, output_x);
269 }
270 else if(_run_sobel_y)
271 {
272 execute_window_loop(window, [&](const Coordinates & id)
273 {
274 const uint8x16_t data = vld1q_u8(input.ptr() - 3);
275
276 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
277 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
278
279 const int32x4x4_t data_s32 =
280 {
281 {
282 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
283 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
284 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
285 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
286 }
287 };
288
Isabella Gottardi43ce8982017-11-08 11:13:23 +0000289 const int32x4x2_t out = compute_hor_sobel_y(data_s32);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100290 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out.val[0]);
291 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
292 },
293 input, output_y);
294 }
295}
296
297NESobel7x7VertKernel::NESobel7x7VertKernel()
298 : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
299{
300}
301
302BorderSize NESobel7x7VertKernel::border_size() const
303{
304 return BorderSize(3, 0);
305}
306
307void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
308{
309 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
310
311 _run_sobel_x = (output_x != nullptr);
312 _run_sobel_y = (output_y != nullptr);
313
314 if(_run_sobel_x)
315 {
316 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32);
317 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
318 }
319
320 if(_run_sobel_y)
321 {
322 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32);
323 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
324 }
325
326 _input_x = input_x;
327 _input_y = input_y;
328 _output_x = output_x;
329 _output_y = output_y;
330
331 const ITensor *const input = _run_sobel_x ? input_x : input_y;
332
333 // Configure kernel window
334 constexpr unsigned int num_elems_processed_per_iteration = 8;
335 constexpr unsigned int num_elems_read_per_iteration = 8;
336 constexpr unsigned int num_elems_written_per_iteration = 8;
337 constexpr unsigned int num_rows_read_per_iteration = 7;
338
339 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
340 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
341 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
342
343 update_window_and_padding(win,
344 AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
345 AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
346 output_x_access,
347 output_y_access);
348
349 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
350 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
351
352 INEKernel::configure(win);
353}
354
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100355void NESobel7x7VertKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100356{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100357 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100358 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
359 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
360
361 Iterator input_x;
362 Iterator input_y;
363 Iterator output_x;
364 Iterator output_y;
365
366 int32_t in_x_stride = 0;
367 int32_t in_y_stride = 0;
368
369 if(_run_sobel_x)
370 {
371 input_x = Iterator(_input_x, window);
372 output_x = Iterator(_output_x, window);
373 in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format());
374 }
375
376 if(_run_sobel_y)
377 {
378 input_y = Iterator(_input_y, window);
379 output_y = Iterator(_output_y, window);
380 in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format());
381 }
382
383 if(_run_sobel_x)
384 {
385 execute_window_loop(window, [&](const Coordinates & id)
386 {
387 auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;
388
389 //top3
390 int32x4x2_t data =
391 {
392 {
393 vld1q_s32(in_ptr),
394 vld1q_s32(in_ptr + 4)
395 }
396 };
397
398 int32x4x2_t out = data;
399
400 //top2
401 in_ptr += in_x_stride;
402 data.val[0] = vld1q_s32(in_ptr);
403 out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
404
405 data.val[1] = vld1q_s32(in_ptr + 4);
406 out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
407
408 //top
409 in_ptr += in_x_stride;
410 data.val[0] = vld1q_s32(in_ptr);
411 out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
412
413 data.val[1] = vld1q_s32(in_ptr + 4);
414 out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
415
416 //mid
417 in_ptr += in_x_stride;
418 data.val[0] = vld1q_s32(in_ptr);
419 out.val[0] = vmlaq_s32(out.val[0], data.val[0], twenty);
420
421 data.val[1] = vld1q_s32(in_ptr + 4);
422 out.val[1] = vmlaq_s32(out.val[1], data.val[1], twenty);
423
424 //low
425 in_ptr += in_x_stride;
426 data.val[0] = vld1q_s32(in_ptr);
427 out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
428
429 data.val[1] = vld1q_s32(in_ptr + 4);
430 out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
431
432 //low2
433 in_ptr += in_x_stride;
434 data.val[0] = vld1q_s32(in_ptr);
435 out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
436
437 data.val[1] = vld1q_s32(in_ptr + 4);
438 out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
439
440 //low3
441 in_ptr += in_x_stride;
442 data.val[0] = vld1q_s32(in_ptr);
443 out.val[0] = vaddq_s32(out.val[0], data.val[0]);
444
445 data.val[1] = vld1q_s32(in_ptr + 4);
446 out.val[1] = vaddq_s32(out.val[1], data.val[1]);
447
448 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 0, out.val[0]);
449 vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
450 },
451 input_x, output_x);
452 }
453
454 if(_run_sobel_y)
455 {
456 execute_window_loop(window, [&](const Coordinates & id)
457 {
458 auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;
459
460 //top3
461 int32x4x2_t data =
462 {
463 {
464 vld1q_s32(in_ptr),
465 vld1q_s32(in_ptr + 4)
466 }
467 };
468
469 int32x4x2_t out =
470 {
471 {
472 vnegq_s32(data.val[0]),
473 vnegq_s32(data.val[1])
474 }
475 };
476
477 //top2
478 in_ptr += in_y_stride;
479 data.val[0] = vld1q_s32(in_ptr);
480 out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfour);
481
482 data.val[1] = vld1q_s32(in_ptr + 4);
483 out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfour);
484
485 //top
486 in_ptr += in_y_stride;
487 data.val[0] = vld1q_s32(in_ptr);
488 out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfive);
489
490 data.val[1] = vld1q_s32(in_ptr + 4);
491 out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfive);
492
493 //low
494 in_ptr += (2 * in_y_stride);
495 data.val[0] = vld1q_s32(in_ptr);
496 out.val[0] = vmlaq_s32(out.val[0], data.val[0], five);
497
498 data.val[1] = vld1q_s32(in_ptr + 4);
499 out.val[1] = vmlaq_s32(out.val[1], data.val[1], five);
500
501 //low2
502 in_ptr += in_y_stride;
503 data.val[0] = vld1q_s32(in_ptr);
504 out.val[0] = vmlaq_s32(out.val[0], data.val[0], four);
505
506 data.val[1] = vld1q_s32(in_ptr + 4);
507 out.val[1] = vmlaq_s32(out.val[1], data.val[1], four);
508
509 //low3
510 in_ptr += in_y_stride;
511 data.val[0] = vld1q_s32(in_ptr);
512 out.val[0] = vaddq_s32(out.val[0], data.val[0]);
513
514 data.val[1] = vld1q_s32(in_ptr + 4);
515 out.val[1] = vaddq_s32(out.val[1], data.val[1]);
516
517 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 0, out.val[0]);
518 vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
519 },
520 input_y, output_y);
521 }
522}