/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include <arm_neon.h>
#include <cmath>
#include <cstddef>
#include <cstdint>

using namespace arm_compute;

NEScaleKernel::NEScaleKernel()
    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr)
{
}

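// The kernel requires a one-element border around the input so that the
// interpolation routines can read neighbouring pixels at the image edges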
BorderSize NEScaleKernel::border_size() const
{
    return BorderSize(1);
}

void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);

    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
    {
        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
    }

    if(policy == InterpolationPolicy::BILINEAR)
    {
        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
    }

    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0);
    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0);

    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
    {
        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
    }

    _input   = input;
    _output  = output;
    _offsets = offsets;
    _dx      = dx;
    _dy      = dy;

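    // Select the worker function that run() will dispatch to for the requested policy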
    switch(policy)
    {
        case InterpolationPolicy::NEAREST_NEIGHBOR:
        {
            _func = &NEScaleKernel::scale_nearest;
            break;
        }
        case InterpolationPolicy::BILINEAR:
        {
            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dx, 1, DataType::F32);
            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dy, 1, DataType::F32);

            _func = &NEScaleKernel::scale_bilinear;
            break;
        }
        case InterpolationPolicy::AREA:
        {
            _func = &NEScaleKernel::scale_area;
            break;
        }
        default:
            ARM_COMPUTE_ERROR("Unsupported interpolation mode");
    }

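    // Process 16 output elements per iteration: one full 128-bit NEON register
    // of U8 data (two registers for S16)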
    constexpr unsigned int num_elems_processed_per_iteration = 16;
    const int              border_offset                     = (border_undefined) ? 0 : border_size().left;

    // Configure kernel window
    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));

    AccessWindowStatic     input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
    AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal dx_access(dx == nullptr ? nullptr : dx->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal dy_access(dy == nullptr ? nullptr : dy->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);

    update_window_and_padding(win,
                              input_access,
                              offsets_access,
                              dx_access,
                              dy_access,
                              output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));

    INEKernel::configure(win);
}

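// Nearest-neighbour scaling. The x component of each source coordinate is taken
// from the precomputed byte offsets in _offsets; the source row is derived on
// the fly from the output row and the height ratio.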
void NEScaleKernel::scale_nearest(const Window &window)
{
    const size_t input_stride = _input->info()->strides_in_bytes()[1];

    // Compute the ratio between source height and destination height
    const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));

    // Don't increment in X and Y direction for the input tensor
    // A pointer to the start of this plane is needed as base for the precomputed offsets
    Window win_in(window);
    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));

    Window win_off;
    win_off.set(Window::DimX, window[Window::DimX]);
    win_off.set(Window::DimY, window[Window::DimY]);

    for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
    {
        win_off.set(d, Window::Dimension(0, 0, 0));
    }

    Iterator in(_input, win_in);
    Iterator out(_output, window);
    Iterator offsets(_offsets, win_off);

    switch(_input->info()->data_type())
    {
        case DataType::U8:
        {
            uint8x16_t tmp = vdupq_n_u8(0);

            execute_window_loop(window, [&](const Coordinates & id)
            {
                const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
                const uint8_t *const in_ptr      = in.ptr();

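                // Map the centre of the output row into the input and truncate to get the source row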
                const size_t in_yi      = (id.y() + 0.5f) * hr;
                const size_t offset_row = in_yi * input_stride;

                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14);
                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15);

                vst1q_u8(out.ptr(), tmp);
            },
            in, offsets, out);
            break;
        }
        case DataType::S16:
        {
            int16x8x2_t tmp =
            {
                {
                    vdupq_n_s16(0),
                    vdupq_n_s16(0)
                }
            };

            execute_window_loop(window, [&](const Coordinates & id)
            {
                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());

                const size_t in_yi      = (id.y() + 0.5f) * hr;
                const size_t offset_row = in_yi * input_stride;

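                // Even-indexed offsets fill tmp.val[0] and odd-indexed ones fill
                // tmp.val[1]; vst2q_s16 interleaves the two registers on store,
                // so the pixels land in memory in their original order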
                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2);
                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3);
                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4);
                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5);
                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6);
                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7);

                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1);
                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2);
                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3);
                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4);
                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5);
                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6);
                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7);

                vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
            },
            in, offsets, out);
            break;
        }
        default:
            ARM_COMPUTE_ERROR("Not supported");
            break;
    }
}

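// Bilinear scaling (U8 only). Each output pixel is interpolated from a 2x2 input
// neighbourhood: _offsets supplies the byte offset of the top-left pixel within
// the source row, while _dx and _dy supply the interpolation weights.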
void NEScaleKernel::scale_bilinear(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);

    // Compute the ratio between source height and destination height
    const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));

    // Don't increment in X and Y direction for the input tensor
    // A pointer to the start of this plane is needed as base for the precomputed offsets
    Window win_in(window);
    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));

    Window win_off;
    win_off.set(Window::DimX, window.x());
    win_off.set(Window::DimY, window.y());

    for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
    {
        win_off.set(d, Window::Dimension(0, 0, 0));
    }

    Iterator in(_input, win_in);
    Iterator out(_output, window);
    Iterator offsets(_offsets, win_off);
    Iterator dx(_dx, win_off);
    Iterator dy(_dy, win_off);

    // Input image stride
    const size_t in_stride = _input->info()->strides_in_bytes()[1];

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
        const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
        const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
        const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());

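        // Map the output row to the top row of the 2x2 input neighbourhood (centre-aligned sampling)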
        const size_t in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
        const size_t offset_row = in_yi * in_stride;

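        // Interpolate 16 output pixels; delta_bilinear_c1u8 blends the 2x2
        // neighbourhood at each offset with the corresponding (dx, dy) weights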
        uint8x8_t tmp0 = vdup_n_u8(0);
        tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
        tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
        tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
        tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
        tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
        tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
        tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
        tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);

        uint8x8_t tmp1 = vdup_n_u8(0);
        tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
        tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
        tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
        tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
        tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
        tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
        tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
        tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);

        vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
    },
    in, offsets, dx, dy, out);
}

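// Area scaling (U8 only). Each output pixel is computed by pixel_area_c1u8_clamp
// from the input region that maps onto it, with source coordinates clamped to
// the image bounds (w, h).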
void NEScaleKernel::scale_area(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);

    // Don't increment in X and Y direction for the input tensor
    // A pointer to the start of this plane is needed as base for the precomputed offsets
    Window win_in(window);
    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));

    Iterator in(_input, win_in);
    Iterator out(_output, window);

    const auto   wr        = static_cast<float>(_input->info()->dimension(0)) / static_cast<float>(_output->info()->dimension(0));
    const auto   hr        = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
    const auto   w         = _input->info()->dimension(0);
    const auto   h         = _input->info()->dimension(1);
    const size_t in_stride = _input->info()->strides_in_bytes()[1];

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());

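        // Compute 16 consecutive output pixels along the row and pack them into a single 128-bit store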
        uint8x8_t tmp0 = vdup_n_u8(0);
        tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
        tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
        tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
        tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
        tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
        tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
        tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
        tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);

        uint8x8_t tmp1 = vdup_n_u8(0);
        tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
        tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
        tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
        tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
        tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
        tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
        tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
        tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);

        vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
    },
    in, out);
}

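// Dispatch to the policy-specific function selected in configure()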
void NEScaleKernel::run(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);

    (this->*_func)(window);
}
359}