blob: 5a66b1f3643aa79ee1defa3f5daf45608d7694b1 [file] [log] [blame]
/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/Types.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/core/Window.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010033#include "src/core/helpers/AutoConfiguration.h"
34#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010035
36#include <arm_neon.h>
37#include <cstddef>
38#include <cstdint>
39
40using namespace arm_compute;
41
42NESobel5x5HorKernel::NESobel5x5HorKernel()
43 : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
44{
45}
46
47BorderSize NESobel5x5HorKernel::border_size() const
48{
49 return _border_size;
50}
51
52void NESobel5x5HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
53{
54 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
55 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
56
57 _run_sobel_x = output_x != nullptr;
58 _run_sobel_y = output_y != nullptr;
59
60 if(_run_sobel_x)
61 {
62 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
63 }
64
65 if(_run_sobel_y)
66 {
67 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
68 }
69
70 _input = input;
71 _output_x = output_x;
72 _output_y = output_y;
73 _border_size = BorderSize(border_undefined ? 0 : 2, 2);
74
75 // Configure kernel window
76 constexpr unsigned int num_elems_processed_per_iteration = 8;
77 constexpr unsigned int num_elems_read_per_iteration = 16;
78 constexpr unsigned int num_elems_written_per_iteration = 8;
79
80 Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
81 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
82 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
83
84 update_window_and_padding(win,
85 AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
86 output_x_access,
87 output_y_access);
88
89 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
90 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
91
92 INEKernel::configure(win);
93}
94
Moritz Pflanzerc186b572017-09-07 09:48:04 +010095void NESobel5x5HorKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010096{
Moritz Pflanzerc186b572017-09-07 09:48:04 +010097 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010098 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
99 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
100
101 Window win_in(window);
102 win_in.shift(Window::DimX, -2);
103
104 Iterator input(_input, win_in);
105 Iterator output_x;
106 Iterator output_y;
107
108 if(_run_sobel_x)
109 {
110 output_x = Iterator(_output_x, window);
111 }
112
113 if(_run_sobel_y)
114 {
115 output_y = Iterator(_output_y, window);
116 }
117
118 if(_run_sobel_y && _run_sobel_x)
119 {
120 static const int16x8_t six = vdupq_n_s16(6);
121 static const int16x8_t four = vdupq_n_s16(4);
122 static const int16x8_t two = vdupq_n_s16(2);
123 static const int16x8_t minustwo = vdupq_n_s16(-2);
124
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100125 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100126 {
127 const uint8x16_t data = vld1q_u8(input.ptr());
128
129 const int16x8x2_t data_s16 =
130 {
131 {
132 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
133 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
134 }
135 };
136
137 int16x8_t out_y = data_s16.val[0];
138 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
139 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
140 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
141 out_y = vaddq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
142
143 vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
144
145 int16x8_t out_x = vnegq_s16(data_s16.val[0]);
146 out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
147 out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
148 out_x = vaddq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
149
150 vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
151 },
152 input, output_x, output_y);
153 }
154 else if(_run_sobel_x)
155 {
156 static const int16x8_t two = vdupq_n_s16(2);
157 static const int16x8_t minustwo = vdupq_n_s16(-2);
158
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100159 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100160 {
161 const uint8x16_t data = vld1q_u8(input.ptr());
162
163 const int16x8x2_t data_s16 =
164 {
165 {
166 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
167 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
168 }
169 };
170
171 int16x8_t out = vnegq_s16(data_s16.val[0]);
172 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
173 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
174 out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
175
176 vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
177 },
178 input, output_x);
179 }
180 else if(_run_sobel_y)
181 {
182 static const int16x8_t six = vdupq_n_s16(6);
183 static const int16x8_t four = vdupq_n_s16(4);
184
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100185 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100186 {
187 const uint8x16_t data = vld1q_u8(input.ptr());
188
189 const int16x8x2_t data_s16 =
190 {
191 {
192 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
193 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
194 }
195 };
196
197 int16x8_t out = data_s16.val[0];
198 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
199 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
200 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
201 out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
202
203 vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
204 },
205 input, output_y);
206 }
207}
208
209NESobel5x5VertKernel::NESobel5x5VertKernel()
210 : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
211{
212}
213
214BorderSize NESobel5x5VertKernel::border_size() const
215{
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100216 return BorderSize{ 2, 0 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100217}
218
219void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
220{
221 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
222
223 _run_sobel_x = output_x != nullptr;
224 _run_sobel_y = output_y != nullptr;
225
226 if(_run_sobel_x)
227 {
228 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S16);
229 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S16);
230 }
231
232 if(_run_sobel_y)
233 {
234 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S16);
235 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S16);
236 }
237
238 _input_x = input_x;
239 _input_y = input_y;
240 _output_x = output_x;
241 _output_y = output_y;
242
243 const ITensor *const input = _run_sobel_x ? input_x : input_y;
244
245 // Configure kernel window
246 constexpr unsigned int num_elems_processed_per_iteration = 16;
247 constexpr unsigned int num_elems_read_per_iteration = 16;
248 constexpr unsigned int num_elems_written_per_iteration = 16;
249 constexpr unsigned int num_rows_read_per_iteration = 5;
250
251 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
252 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
253 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
254
255 update_window_and_padding(win,
256 AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
257 AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
258 output_x_access,
259 output_y_access);
260
261 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
262 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
263
264 INEKernel::configure(win);
265}
266
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100267void NESobel5x5VertKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100268{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100269 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100270 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
271 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
272
273 Iterator input_x;
274 Iterator input_y;
275 Iterator output_x;
276 Iterator output_y;
277
278 const int16_t *input_x_low2_ptr = nullptr;
279 const int16_t *input_x_low_ptr = nullptr;
280 const int16_t *input_x_mid_ptr = nullptr;
281 const int16_t *input_x_top_ptr = nullptr;
282 const int16_t *input_x_top2_ptr = nullptr;
283
284 const int16_t *input_y_low2_ptr = nullptr;
285 const int16_t *input_y_low_ptr = nullptr;
286 const int16_t *input_y_top_ptr = nullptr;
287 const int16_t *input_y_top2_ptr = nullptr;
288
289 if(_run_sobel_x)
290 {
291 input_x = Iterator(_input_x, window);
292 output_x = Iterator(_output_x, window);
293 input_x_top2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -2)));
294 input_x_top_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -1)));
295 input_x_mid_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 0)));
296 input_x_low_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 1)));
297 input_x_low2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 2)));
298 }
299
300 if(_run_sobel_y)
301 {
302 input_y = Iterator(_input_y, window);
303 output_y = Iterator(_output_y, window);
304 input_y_top2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -2)));
305 input_y_top_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -1)));
306 input_y_low_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 1)));
307 input_y_low2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 2)));
308 }
309
310 static const int16x8_t six = vdupq_n_s16(6);
311 static const int16x8_t four = vdupq_n_s16(4);
312 static const int16x8_t two = vdupq_n_s16(2);
313 static const int16x8_t minustwo = vdupq_n_s16(-2);
314
315 if(_run_sobel_x)
316 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100317 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100318 {
319 // Convert offset from uint8_t* to uint16_t*
320 const size_t input_offset_high_s16 = input_x.offset() / 2;
321 const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
322
323 //HIGH DATA
324 //top2
325 int16x8_t data_high = vld1q_s16(input_x_top2_ptr + input_offset_high_s16);
326 int16x8_t out_high = data_high;
327 //top
328 data_high = vld1q_s16(input_x_top_ptr + input_offset_high_s16);
329 out_high = vmlaq_s16(out_high, data_high, four);
330 //mid
331 data_high = vld1q_s16(input_x_mid_ptr + input_offset_high_s16);
332 out_high = vmlaq_s16(out_high, data_high, six);
333 //low
334 data_high = vld1q_s16(input_x_low_ptr + input_offset_high_s16);
335 out_high = vmlaq_s16(out_high, data_high, four);
336 //low2
337 data_high = vld1q_s16(input_x_low2_ptr + input_offset_high_s16);
338 out_high = vaddq_s16(out_high, data_high);
339
340 vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())), out_high);
341
342 //LOW DATA
343 //top2
344 int16x8_t data_low = vld1q_s16(input_x_top2_ptr + input_offset_low_s16);
345 int16x8_t out_low = data_low;
346 //top
347 data_low = vld1q_s16(input_x_top_ptr + input_offset_low_s16);
348 out_low = vmlaq_s16(out_low, data_low, four);
349 //mid
350 data_low = vld1q_s16(input_x_mid_ptr + input_offset_low_s16);
351 out_low = vmlaq_s16(out_low, data_low, six);
352 //low
353 data_low = vld1q_s16(input_x_low_ptr + input_offset_low_s16);
354 out_low = vmlaq_s16(out_low, data_low, four);
355 //low2
356 data_low = vld1q_s16(input_x_low2_ptr + input_offset_low_s16);
357 out_low = vaddq_s16(out_low, data_low);
358
359 vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())) + 8, out_low);
360 },
361 input_x, output_x);
362 }
363
364 if(_run_sobel_y)
365 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100366 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100367 {
368 // Convert offset from uint8_t* to uint16_t*
369 const size_t input_offset_high_s16 = input_y.offset() / 2;
370 const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
371
372 //HIGH DATA
373 //top2
374 int16x8_t data_high = vld1q_s16(input_y_top2_ptr + input_offset_high_s16);
375 int16x8_t out_high = vnegq_s16(data_high);
376 //top
377 data_high = vld1q_s16(input_y_top_ptr + input_offset_high_s16);
378 out_high = vmlaq_s16(out_high, data_high, minustwo);
379 //low
380 data_high = vld1q_s16(input_y_low_ptr + input_offset_high_s16);
381 out_high = vmlaq_s16(out_high, data_high, two);
382 //low2
383 data_high = vld1q_s16(input_y_low2_ptr + input_offset_high_s16);
384 out_high = vaddq_s16(out_high, data_high);
385
386 vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())), out_high);
387
388 //LOW DATA
389 //top2
390 int16x8_t data_low = vld1q_s16(input_y_top2_ptr + input_offset_low_s16);
391 int16x8_t out_low = vnegq_s16(data_low);
392 //top
393 data_low = vld1q_s16(input_y_top_ptr + input_offset_low_s16);
394 out_low = vmlaq_s16(out_low, data_low, minustwo);
395 //low
396 data_low = vld1q_s16(input_y_low_ptr + input_offset_low_s16);
397 out_low = vmlaq_s16(out_low, data_low, two);
398 //low2
399 data_low = vld1q_s16(input_y_low2_ptr + input_offset_low_s16);
400 out_low = vaddq_s16(out_low, data_low);
401
402 vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())) + 8, out_low);
403 },
404 input_y, output_y);
405 }
406}