/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
25
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/IAccessWindow.h"
30#include "arm_compute/core/ITensor.h"
31#include "arm_compute/core/TensorInfo.h"
32#include "arm_compute/core/Types.h"
33#include "arm_compute/core/Validate.h"
steniu014c2938e2017-06-19 15:44:45 +010034#include "arm_compute/core/Window.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010035
steniu014c2938e2017-06-19 15:44:45 +010036#include <algorithm>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037#include <arm_neon.h>
38#include <climits>
39#include <cstddef>
40
41namespace arm_compute
42{
43NEMinMaxKernel::NEMinMaxKernel()
44 : _func(), _input(nullptr), _min(), _max(), _min_init(), _max_init(), _mtx()
45{
46}
47
48void NEMinMaxKernel::configure(const IImage *input, int32_t *min, int32_t *max)
49{
50 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
51 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
52 ARM_COMPUTE_ERROR_ON(nullptr == min);
53 ARM_COMPUTE_ERROR_ON(nullptr == max);
54
55 _input = input;
56 _min = min;
57 _max = max;
58
59 switch(input->info()->format())
60 {
61 case Format::U8:
62 _min_init = UCHAR_MAX;
63 _max_init = 0;
64 _func = &NEMinMaxKernel::minmax_U8;
65 break;
66 case Format::S16:
67 _min_init = SHRT_MAX;
68 _max_init = SHRT_MIN;
69 _func = &NEMinMaxKernel::minmax_S16;
70 break;
71 default:
72 ARM_COMPUTE_ERROR("You called with the wrong img formats");
73 break;
74 }
75
Anthony Barbier6ff3b192017-09-04 18:44:23 +010076 // Configure kernel window
steniu014c2938e2017-06-19 15:44:45 +010077 constexpr unsigned int num_elems_processed_per_iteration = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010078
steniu014c2938e2017-06-19 15:44:45 +010079 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010080
81 INEKernel::configure(win);
82}
83
84void NEMinMaxKernel::run(const Window &window)
85{
86 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
87 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
88 ARM_COMPUTE_ERROR_ON(_func == nullptr);
89
90 (this->*_func)(window);
91}
92
93void NEMinMaxKernel::reset()
94{
95 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
96 *_min = _min_init;
97 *_max = _max_init;
98}
99
100template <typename T>
101void NEMinMaxKernel::update_min_max(const T min, const T max)
102{
103 std::lock_guard<std::mutex> lock(_mtx);
104
105 if(min < *_min)
106 {
107 *_min = min;
108 }
109
110 if(max > *_max)
111 {
112 *_max = max;
113 }
114}
115
steniu014c2938e2017-06-19 15:44:45 +0100116void NEMinMaxKernel::minmax_U8(Window win)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100117{
118 uint8x8_t carry_min = vdup_n_u8(UCHAR_MAX);
119 uint8x8_t carry_max = vdup_n_u8(0);
120
steniu014c2938e2017-06-19 15:44:45 +0100121 uint8_t carry_max_scalar = 0;
122 uint8_t carry_min_scalar = UCHAR_MAX;
123
124 const int x_start = win.x().start();
125 const int x_end = win.x().end();
126
127 // Handle X dimension manually to split into two loops
128 // First one will use vector operations, second one processes the left over pixels
129 win.set(Window::DimX, Window::Dimension(0, 1, 1));
130
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100131 Iterator input(_input, win);
132
133 execute_window_loop(win, [&](const Coordinates & id)
134 {
steniu014c2938e2017-06-19 15:44:45 +0100135 int x = x_start;
136
137 // Vector loop
138 for(; x <= x_end - 16; x += 16)
139 {
140 const uint8x16_t pixels = vld1q_u8(input.ptr() + x);
141 const uint8x8_t tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
142 const uint8x8_t tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
143 carry_min = vmin_u8(tmp_min, carry_min);
144 carry_max = vmax_u8(tmp_max, carry_max);
145 }
146
147 // Process leftover pixels
148 for(; x < x_end; ++x)
149 {
150 const uint8_t pixel = input.ptr()[x];
151 carry_min_scalar = std::min(pixel, carry_min_scalar);
152 carry_max_scalar = std::max(pixel, carry_max_scalar);
153 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100154 },
155 input);
156
157 // Reduce result
158 carry_min = vpmin_u8(carry_min, carry_min);
159 carry_max = vpmax_u8(carry_max, carry_max);
160 carry_min = vpmin_u8(carry_min, carry_min);
161 carry_max = vpmax_u8(carry_max, carry_max);
162 carry_min = vpmin_u8(carry_min, carry_min);
163 carry_max = vpmax_u8(carry_max, carry_max);
164
165 // Extract max/min values
steniu014c2938e2017-06-19 15:44:45 +0100166 const uint8_t min_i = std::min(vget_lane_u8(carry_min, 0), carry_min_scalar);
167 const uint8_t max_i = std::max(vget_lane_u8(carry_max, 0), carry_max_scalar);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100168
169 // Perform reduction of local min/max values
170 update_min_max(min_i, max_i);
171}
172
steniu014c2938e2017-06-19 15:44:45 +0100173void NEMinMaxKernel::minmax_S16(Window win)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100174{
175 int16x4_t carry_min = vdup_n_s16(SHRT_MAX);
176 int16x4_t carry_max = vdup_n_s16(SHRT_MIN);
177
steniu014c2938e2017-06-19 15:44:45 +0100178 int16_t carry_max_scalar = SHRT_MIN;
179 int16_t carry_min_scalar = SHRT_MAX;
180
181 const int x_start = win.x().start();
182 const int x_end = win.x().end();
183
184 // Handle X dimension manually to split into two loops
185 // First one will use vector operations, second one processes the left over pixels
186 win.set(Window::DimX, Window::Dimension(0, 1, 1));
187
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100188 Iterator input(_input, win);
189
190 execute_window_loop(win, [&](const Coordinates & id)
191 {
steniu014c2938e2017-06-19 15:44:45 +0100192 int x = x_start;
193 const auto in_ptr = reinterpret_cast<const int16_t *const>(input.ptr());
194
195 // Vector loop
196 for(; x <= x_end - 16; x += 16)
197 {
198 const int16x8x2_t pixels = vld2q_s16(in_ptr + x);
199 const int16x8_t tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
200 const int16x8_t tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
201 const int16x4_t tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
202 const int16x4_t tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
203 carry_min = vmin_s16(tmp_min2, carry_min);
204 carry_max = vmax_s16(tmp_max2, carry_max);
205 }
206
207 // Process leftover pixels
208 for(; x < x_end; ++x)
209 {
210 const int16_t pixel = in_ptr[x];
211 carry_min_scalar = std::min(pixel, carry_min_scalar);
212 carry_max_scalar = std::max(pixel, carry_max_scalar);
213 }
214
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100215 },
216 input);
217
218 // Reduce result
219 carry_min = vpmin_s16(carry_min, carry_min);
220 carry_max = vpmax_s16(carry_max, carry_max);
221 carry_min = vpmin_s16(carry_min, carry_min);
222 carry_max = vpmax_s16(carry_max, carry_max);
223
224 // Extract max/min values
steniu014c2938e2017-06-19 15:44:45 +0100225 const int16_t min_i = std::min(vget_lane_s16(carry_min, 0), carry_min_scalar);
226 const int16_t max_i = std::max(vget_lane_s16(carry_max, 0), carry_max_scalar);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100227
228 // Perform reduction of local min/max values
229 update_min_max(min_i, max_i);
230}
231
232NEMinMaxLocationKernel::NEMinMaxLocationKernel()
steniu014c2938e2017-06-19 15:44:45 +0100233 : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100234{
235}
236
237bool NEMinMaxLocationKernel::is_parallelisable() const
238{
239 return false;
240}
241
/** Tag type whose template arguments form a compile-time list of unsigned indices.
 *
 * Declares no data members; all special member functions are defaulted.
 */
template <unsigned int...>
struct index_seq
{
    index_seq() = default;
    index_seq(const index_seq &) = default;
    index_seq &operator=(const index_seq &) = default;
    index_seq(index_seq &&) noexcept = default;
    index_seq &operator=(index_seq &&) noexcept = default;
    virtual ~index_seq() = default;
};

/** Recursive generator: gen_index_seq<N>::type is index_seq<0, 1, ..., N-1>.
 *
 * Each step prepends (Remaining - 1) to the accumulated pack and recurses
 * with Remaining - 1 until the counter reaches zero.
 */
template <unsigned int Remaining, unsigned int... Indices>
struct gen_index_seq : gen_index_seq < Remaining - 1, Remaining - 1, Indices... >
{
};

/** Recursion end: the counter is zero, so the accumulated pack is the final sequence. */
template <unsigned int... Indices>
struct gen_index_seq<0u, Indices...> : index_seq<Indices...>
{
    using type = index_seq<Indices...>;
};
261
262template <class T, unsigned int... N>
263struct NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>
264{
265 static const NEMinMaxLocationKernel::MinMaxLocFunction func_table[sizeof...(N)];
266};
267
268template <class T, unsigned int... N>
269const NEMinMaxLocationKernel::MinMaxLocFunction NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>::func_table[sizeof...(N)] =
270{
271 &NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
272};
273
274void NEMinMaxLocationKernel::configure(const IImage *input, int32_t *min, int32_t *max,
275 ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
276 uint32_t *min_count, uint32_t *max_count)
277{
278 ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
279 ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8, Format::S16);
280 ARM_COMPUTE_ERROR_ON(nullptr == min);
281 ARM_COMPUTE_ERROR_ON(nullptr == max);
282
283 _input = input;
284 _min = min;
285 _max = max;
286 _min_count = min_count;
287 _max_count = max_count;
288 _min_loc = min_loc;
289 _max_loc = max_loc;
290
291 unsigned int count_min = (nullptr != min_count ? 1 : 0);
292 unsigned int count_max = (nullptr != max_count ? 1 : 0);
293 unsigned int loc_min = (nullptr != min_loc ? 1 : 0);
294 unsigned int loc_max = (nullptr != max_loc ? 1 : 0);
295
296 unsigned int table_idx = (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max;
297
298 switch(input->info()->format())
299 {
300 case Format::U8:
301 _func = create_func_table<uint8_t, gen_index_seq<16>::type>::func_table[table_idx];
302 break;
303 case Format::S16:
304 _func = create_func_table<int16_t, gen_index_seq<16>::type>::func_table[table_idx];
305 break;
306 default:
307 ARM_COMPUTE_ERROR("You called with the wrong img formats");
308 break;
309 }
310
steniu014c2938e2017-06-19 15:44:45 +0100311 constexpr unsigned int num_elems_processed_per_iteration = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100312
313 // Configure kernel window
steniu014c2938e2017-06-19 15:44:45 +0100314 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100315
steniu014c2938e2017-06-19 15:44:45 +0100316 update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100317
318 INEKernel::configure(win);
319}
320
321void NEMinMaxLocationKernel::run(const Window &window)
322{
323 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
324 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
325 ARM_COMPUTE_ERROR_ON(_func == nullptr);
326
327 (this->*_func)(window);
328}
329
330template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
331void NEMinMaxLocationKernel::minmax_loc(const Window &win)
332{
333 if(count_min || count_max || loc_min || loc_max)
334 {
335 Iterator input(_input, win);
336
steniu014c2938e2017-06-19 15:44:45 +0100337 size_t min_count = 0;
338 size_t max_count = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100339
340 // Clear min location array
341 if(loc_min)
342 {
343 _min_loc->clear();
344 }
345
346 // Clear max location array
347 if(loc_max)
348 {
349 _max_loc->clear();
350 }
351
352 execute_window_loop(win, [&](const Coordinates & id)
353 {
354 auto in_ptr = reinterpret_cast<const T *>(input.ptr());
355 int32_t idx = id.x();
356 int32_t idy = id.y();
357
steniu014c2938e2017-06-19 15:44:45 +0100358 const T pixel = *in_ptr;
359 Coordinates2D p{ idx, idy };
360
361 if(count_min || loc_min)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100362 {
steniu014c2938e2017-06-19 15:44:45 +0100363 if(*_min == pixel)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100364 {
steniu014c2938e2017-06-19 15:44:45 +0100365 if(count_min)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100366 {
steniu014c2938e2017-06-19 15:44:45 +0100367 ++min_count;
368 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100369
steniu014c2938e2017-06-19 15:44:45 +0100370 if(loc_min)
371 {
372 _min_loc->push_back(p);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100373 }
374 }
steniu014c2938e2017-06-19 15:44:45 +0100375 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100376
steniu014c2938e2017-06-19 15:44:45 +0100377 if(count_max || loc_max)
378 {
379 if(*_max == pixel)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100380 {
steniu014c2938e2017-06-19 15:44:45 +0100381 if(count_max)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100382 {
steniu014c2938e2017-06-19 15:44:45 +0100383 ++max_count;
384 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100385
steniu014c2938e2017-06-19 15:44:45 +0100386 if(loc_max)
387 {
388 _max_loc->push_back(p);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100389 }
390 }
391 }
392 },
393 input);
394
395 if(count_min)
396 {
397 *_min_count = min_count;
398 }
399
400 if(count_max)
401 {
402 *_max_count = max_count;
403 }
404 }
405}
406} // namespace arm_compute