blob: a92cfc2308fbf203b4bb423bcd164db736328a2b [file] [log] [blame]
/*
 * Copyright (c) 2016-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
25
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/Types.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/core/Window.h"
33
34#include <arm_neon.h>
35#include <cstddef>
36#include <cstdint>
37
38using namespace arm_compute;
39
40NESobel5x5HorKernel::NESobel5x5HorKernel()
41 : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
42{
43}
44
45BorderSize NESobel5x5HorKernel::border_size() const
46{
47 return _border_size;
48}
49
50void NESobel5x5HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
51{
52 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
53 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
54
55 _run_sobel_x = output_x != nullptr;
56 _run_sobel_y = output_y != nullptr;
57
58 if(_run_sobel_x)
59 {
60 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
61 }
62
63 if(_run_sobel_y)
64 {
65 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
66 }
67
68 _input = input;
69 _output_x = output_x;
70 _output_y = output_y;
71 _border_size = BorderSize(border_undefined ? 0 : 2, 2);
72
73 // Configure kernel window
74 constexpr unsigned int num_elems_processed_per_iteration = 8;
75 constexpr unsigned int num_elems_read_per_iteration = 16;
76 constexpr unsigned int num_elems_written_per_iteration = 8;
77
78 Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
79 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
80 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
81
82 update_window_and_padding(win,
83 AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
84 output_x_access,
85 output_y_access);
86
87 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
88 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
89
90 INEKernel::configure(win);
91}
92
Moritz Pflanzerc186b572017-09-07 09:48:04 +010093void NESobel5x5HorKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010094{
Moritz Pflanzerc186b572017-09-07 09:48:04 +010095 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010096 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
97 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
98
99 Window win_in(window);
100 win_in.shift(Window::DimX, -2);
101
102 Iterator input(_input, win_in);
103 Iterator output_x;
104 Iterator output_y;
105
106 if(_run_sobel_x)
107 {
108 output_x = Iterator(_output_x, window);
109 }
110
111 if(_run_sobel_y)
112 {
113 output_y = Iterator(_output_y, window);
114 }
115
116 if(_run_sobel_y && _run_sobel_x)
117 {
118 static const int16x8_t six = vdupq_n_s16(6);
119 static const int16x8_t four = vdupq_n_s16(4);
120 static const int16x8_t two = vdupq_n_s16(2);
121 static const int16x8_t minustwo = vdupq_n_s16(-2);
122
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100123 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100124 {
125 const uint8x16_t data = vld1q_u8(input.ptr());
126
127 const int16x8x2_t data_s16 =
128 {
129 {
130 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
131 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
132 }
133 };
134
135 int16x8_t out_y = data_s16.val[0];
136 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
137 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
138 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
139 out_y = vaddq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
140
141 vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
142
143 int16x8_t out_x = vnegq_s16(data_s16.val[0]);
144 out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
145 out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
146 out_x = vaddq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
147
148 vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
149 },
150 input, output_x, output_y);
151 }
152 else if(_run_sobel_x)
153 {
154 static const int16x8_t two = vdupq_n_s16(2);
155 static const int16x8_t minustwo = vdupq_n_s16(-2);
156
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100157 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100158 {
159 const uint8x16_t data = vld1q_u8(input.ptr());
160
161 const int16x8x2_t data_s16 =
162 {
163 {
164 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
165 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
166 }
167 };
168
169 int16x8_t out = vnegq_s16(data_s16.val[0]);
170 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
171 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
172 out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
173
174 vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
175 },
176 input, output_x);
177 }
178 else if(_run_sobel_y)
179 {
180 static const int16x8_t six = vdupq_n_s16(6);
181 static const int16x8_t four = vdupq_n_s16(4);
182
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100183 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100184 {
185 const uint8x16_t data = vld1q_u8(input.ptr());
186
187 const int16x8x2_t data_s16 =
188 {
189 {
190 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
191 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
192 }
193 };
194
195 int16x8_t out = data_s16.val[0];
196 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
197 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
198 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
199 out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
200
201 vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
202 },
203 input, output_y);
204 }
205}
206
207NESobel5x5VertKernel::NESobel5x5VertKernel()
208 : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
209{
210}
211
212BorderSize NESobel5x5VertKernel::border_size() const
213{
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100214 return BorderSize{ 2, 0 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100215}
216
217void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
218{
219 ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
220
221 _run_sobel_x = output_x != nullptr;
222 _run_sobel_y = output_y != nullptr;
223
224 if(_run_sobel_x)
225 {
226 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S16);
227 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S16);
228 }
229
230 if(_run_sobel_y)
231 {
232 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S16);
233 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S16);
234 }
235
236 _input_x = input_x;
237 _input_y = input_y;
238 _output_x = output_x;
239 _output_y = output_y;
240
241 const ITensor *const input = _run_sobel_x ? input_x : input_y;
242
243 // Configure kernel window
244 constexpr unsigned int num_elems_processed_per_iteration = 16;
245 constexpr unsigned int num_elems_read_per_iteration = 16;
246 constexpr unsigned int num_elems_written_per_iteration = 16;
247 constexpr unsigned int num_rows_read_per_iteration = 5;
248
249 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
250 AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
251 AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
252
253 update_window_and_padding(win,
254 AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
255 AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
256 output_x_access,
257 output_y_access);
258
259 output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
260 output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
261
262 INEKernel::configure(win);
263}
264
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100265void NESobel5x5VertKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100266{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100267 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100268 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
269 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
270
271 Iterator input_x;
272 Iterator input_y;
273 Iterator output_x;
274 Iterator output_y;
275
276 const int16_t *input_x_low2_ptr = nullptr;
277 const int16_t *input_x_low_ptr = nullptr;
278 const int16_t *input_x_mid_ptr = nullptr;
279 const int16_t *input_x_top_ptr = nullptr;
280 const int16_t *input_x_top2_ptr = nullptr;
281
282 const int16_t *input_y_low2_ptr = nullptr;
283 const int16_t *input_y_low_ptr = nullptr;
284 const int16_t *input_y_top_ptr = nullptr;
285 const int16_t *input_y_top2_ptr = nullptr;
286
287 if(_run_sobel_x)
288 {
289 input_x = Iterator(_input_x, window);
290 output_x = Iterator(_output_x, window);
291 input_x_top2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -2)));
292 input_x_top_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -1)));
293 input_x_mid_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 0)));
294 input_x_low_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 1)));
295 input_x_low2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 2)));
296 }
297
298 if(_run_sobel_y)
299 {
300 input_y = Iterator(_input_y, window);
301 output_y = Iterator(_output_y, window);
302 input_y_top2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -2)));
303 input_y_top_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -1)));
304 input_y_low_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 1)));
305 input_y_low2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 2)));
306 }
307
308 static const int16x8_t six = vdupq_n_s16(6);
309 static const int16x8_t four = vdupq_n_s16(4);
310 static const int16x8_t two = vdupq_n_s16(2);
311 static const int16x8_t minustwo = vdupq_n_s16(-2);
312
313 if(_run_sobel_x)
314 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100315 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100316 {
317 // Convert offset from uint8_t* to uint16_t*
318 const size_t input_offset_high_s16 = input_x.offset() / 2;
319 const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
320
321 //HIGH DATA
322 //top2
323 int16x8_t data_high = vld1q_s16(input_x_top2_ptr + input_offset_high_s16);
324 int16x8_t out_high = data_high;
325 //top
326 data_high = vld1q_s16(input_x_top_ptr + input_offset_high_s16);
327 out_high = vmlaq_s16(out_high, data_high, four);
328 //mid
329 data_high = vld1q_s16(input_x_mid_ptr + input_offset_high_s16);
330 out_high = vmlaq_s16(out_high, data_high, six);
331 //low
332 data_high = vld1q_s16(input_x_low_ptr + input_offset_high_s16);
333 out_high = vmlaq_s16(out_high, data_high, four);
334 //low2
335 data_high = vld1q_s16(input_x_low2_ptr + input_offset_high_s16);
336 out_high = vaddq_s16(out_high, data_high);
337
338 vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())), out_high);
339
340 //LOW DATA
341 //top2
342 int16x8_t data_low = vld1q_s16(input_x_top2_ptr + input_offset_low_s16);
343 int16x8_t out_low = data_low;
344 //top
345 data_low = vld1q_s16(input_x_top_ptr + input_offset_low_s16);
346 out_low = vmlaq_s16(out_low, data_low, four);
347 //mid
348 data_low = vld1q_s16(input_x_mid_ptr + input_offset_low_s16);
349 out_low = vmlaq_s16(out_low, data_low, six);
350 //low
351 data_low = vld1q_s16(input_x_low_ptr + input_offset_low_s16);
352 out_low = vmlaq_s16(out_low, data_low, four);
353 //low2
354 data_low = vld1q_s16(input_x_low2_ptr + input_offset_low_s16);
355 out_low = vaddq_s16(out_low, data_low);
356
357 vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())) + 8, out_low);
358 },
359 input_x, output_x);
360 }
361
362 if(_run_sobel_y)
363 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100364 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100365 {
366 // Convert offset from uint8_t* to uint16_t*
367 const size_t input_offset_high_s16 = input_y.offset() / 2;
368 const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
369
370 //HIGH DATA
371 //top2
372 int16x8_t data_high = vld1q_s16(input_y_top2_ptr + input_offset_high_s16);
373 int16x8_t out_high = vnegq_s16(data_high);
374 //top
375 data_high = vld1q_s16(input_y_top_ptr + input_offset_high_s16);
376 out_high = vmlaq_s16(out_high, data_high, minustwo);
377 //low
378 data_high = vld1q_s16(input_y_low_ptr + input_offset_high_s16);
379 out_high = vmlaq_s16(out_high, data_high, two);
380 //low2
381 data_high = vld1q_s16(input_y_low2_ptr + input_offset_high_s16);
382 out_high = vaddq_s16(out_high, data_high);
383
384 vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())), out_high);
385
386 //LOW DATA
387 //top2
388 int16x8_t data_low = vld1q_s16(input_y_top2_ptr + input_offset_low_s16);
389 int16x8_t out_low = vnegq_s16(data_low);
390 //top
391 data_low = vld1q_s16(input_y_top_ptr + input_offset_low_s16);
392 out_low = vmlaq_s16(out_low, data_low, minustwo);
393 //low
394 data_low = vld1q_s16(input_y_low_ptr + input_offset_low_s16);
395 out_low = vmlaq_s16(out_low, data_low, two);
396 //low2
397 data_low = vld1q_s16(input_y_low2_ptr + input_offset_low_s16);
398 out_low = vaddq_s16(out_low, data_low);
399
400 vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())) + 8, out_low);
401 },
402 input_y, output_y);
403 }
404}