blob: 30b67b64b96c8fa0b6640a2eb00772ea9ebecaf7 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
31#include "arm_compute/core/NEON/NEFixedPoint.h"
32#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Utils.h"
34#include "arm_compute/core/Validate.h"
35#include "arm_compute/core/Window.h"
36
37#include <algorithm>
38#include <arm_neon.h>
39#include <limits>
40#include <string>
41#include <tuple>
42
43using namespace arm_compute;
44
45namespace
46{
47inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
48 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
49{
50 int start_x = id.x() * stride_x - pad_x;
51 int start_y = id.y() * stride_y - pad_y;
52 int end_x = std::min(start_x + pool_size, upper_bound_w);
53 int end_y = std::min(start_y + pool_size, upper_bound_h);
54 return 1.f / ((end_y - start_y) * (end_x - start_x));
55}
56
57inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
58 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
59{
60 static std::array<qint8_t, 10> scale_values_q8 =
61 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
62 const int start_x = id.x() * stride_x - pad_x;
63 const int start_y = id.y() * stride_y - pad_y;
64 const int end_x = std::min(start_x + pool_size, upper_bound_w);
65 const int end_y = std::min(start_y + pool_size, upper_bound_h);
66 const int val = ((end_y - start_y) * (end_x - start_x));
67 return scale_values_q8[val] >> (7 - fixed_point_position);
68}
69} // namespace
70
71NEPoolingLayerKernel::NEPoolingLayerKernel()
72 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
73{
74}
75
76BorderSize NEPoolingLayerKernel::border_size() const
77{
78 return _border_size;
79}
80
81void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
82{
83 int pool_pad_x = 0;
84 int pool_pad_y = 0;
85 int pool_stride_x = 0;
86 int pool_stride_y = 0;
87 unsigned int pooled_w = 0;
88 unsigned int pooled_h = 0;
89 PoolingType pool_type = pool_info.pool_type();
90 int pool_size = pool_info.pool_size();
91 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
92 DimensionRoundingType pool_round = pad_stride_info.round();
93 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
94 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
95
96 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
97 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
98 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
99 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
100 ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
101 ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
102 ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6);
103 ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2);
104
105 // Check output dimensions
106 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
107 pool_size, pool_stride_x, pool_stride_y,
108 pool_pad_x, pool_pad_y, pool_round);
109 ARM_COMPUTE_UNUSED(pooled_w);
110 ARM_COMPUTE_UNUSED(pooled_h);
111 ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
112
113 unsigned int num_elems_read_per_iteration = 0;
114 unsigned int num_elems_processed_per_iteration = 0;
115 unsigned int num_elems_horizontal_window = 0;
116
117 // Select element size
118 switch(input->info()->data_type())
119 {
120 case DataType::QS8:
121 num_elems_read_per_iteration = 16;
122 num_elems_processed_per_iteration = (pool_size == 2) ? 8 : 7;
123 num_elems_horizontal_window = 8;
124 break;
125 case DataType::F32:
126 num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vload4 for pooling3
127 num_elems_processed_per_iteration = 1;
128 num_elems_horizontal_window = 1;
129 break;
130 default:
131 ARM_COMPUTE_ERROR("Element size not supported");
132 break;
133 }
134
135 _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
136 const int input_width = input->info()->dimension(0);
137 const int input_height = input->info()->dimension(1);
138 const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
139 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
140
141 // Set instance variables
142 _input = input;
143 _output = output;
144 _pool_info = pool_info;
145 _border_size = BorderSize(pool_pad_y, pool_pad_x);
146 _border_size.right = std::max(upper_bound_w, pool_pad_x);
147 _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
148
149 // Select appropriate function
150 switch(pool_size)
151 {
152 case 2:
153 if(input->info()->data_type() == DataType::QS8)
154 {
155 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
156 }
157 else if(input->info()->data_type() == DataType::F32)
158 {
159 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
160 }
161 break;
162 case 3:
163 if(input->info()->data_type() == DataType::QS8)
164 {
165 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
166 }
167 else if(input->info()->data_type() == DataType::F32)
168 {
169 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
170 }
171 break;
172 default:
173 ARM_COMPUTE_ERROR("Unsupported pooling size");
174 break;
175 }
176
177 // Configure kernel window
178 Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
179 AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
180 AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
181 update_window_and_padding(win, input_access, output_access);
182 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
183 INEKernel::configure(win);
184}
185
186template <PoolingType pooling_type>
187void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
188{
189 Iterator input(_input, window_input);
190 Iterator output(_output, window);
191
192 const int fixed_point_position = _input->info()->fixed_point_position();
193 constexpr int pool_size = 2;
194 int pool_pad_x = 0;
195 int pool_pad_y = 0;
196 int pool_stride_x = 0;
197 int pool_stride_y = 0;
198 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
199 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
200 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
201 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
202
203 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
204 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
205
206 execute_window_loop(window, [&](const Coordinates & id)
207 {
208 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
209 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
210 qint8x8_t res = {};
211 if(pooling_type == PoolingType::AVG)
212 {
213 // Calculate scale
214 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
215 const qint8x8_t scale_vec = vdup_n_qs8(scale);
216
217 // Perform pooling
218 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
219 res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
220 }
221 else
222 {
223 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
224 res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
225 }
226 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
227 },
228 input, output);
229}
230
231template <PoolingType pooling_type>
232void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
233{
234 Iterator input(_input, window_input);
235 Iterator output(_output, window);
236
237 constexpr int pool_size = 2;
238 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
239 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
240 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
241 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
242 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
243
244 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
245 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
246
247 execute_window_loop(window, [&](const Coordinates & id)
248 {
249 const float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
250 const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
251 float32x2_t res = {};
252 if(pooling_type == PoolingType::AVG)
253 {
254 // Calculate scale
255 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
256 const float32x2_t scale_v = vdup_n_f32(scale);
257
258 // Perform pooling
259 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
260 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
261 }
262 else
263 {
264 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
265 res = vpmax_f32(max_data, max_data);
266 }
267 *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
268 },
269 input, output);
270}
271
272template <PoolingType pooling_type>
273void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
274{
275 Iterator input(_input, window_input);
276 Iterator output(_output, window);
277
278 const int fixed_point_position = _input->info()->fixed_point_position();
279 constexpr int pool_size = 3;
280 int pool_pad_x = 0;
281 int pool_pad_y = 0;
282 int pool_stride_x = 0;
283 int pool_stride_y = 0;
284 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
285 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
286 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
287 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
288
289 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
290 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
291 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
292
293 execute_window_loop(window, [&](const Coordinates & id)
294 {
295 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
296 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
297 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
298 qint8x8_t res = {};
299 if(pooling_type == PoolingType::AVG)
300 {
301 // Calculate scale
302 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
303 const qint8x8_t scale_vec = vdup_n_qs8(scale);
304
305 // Perform pooling for stride 2
306 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
307 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
308 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
309 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
310 if(pool_stride_x == 2)
311 {
312 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
313 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
314 res = vtbl2_s8(table, lookup_val);
315 }
316 else
317 {
318 res = vget_low_s8(final_sum);
319 }
320 res = vqmul_qs8(res, scale_vec, fixed_point_position);
321 }
322 else
323 {
324 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
325 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
326 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
327 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
328
329 if(pool_stride_x == 2)
330 {
331 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
332 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
333 res = vtbl2_s8(table, lookup_val);
334 }
335 else
336 {
337 res = vget_low_s8(final_max);
338 }
339 }
340 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
341 },
342 input, output);
343}
344
345template <PoolingType pooling_type>
346void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
347{
348 Iterator input(_input, window_input);
349 Iterator output(_output, window);
350
351 constexpr const int pool_size = 3;
352 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
353 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
354 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
355 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
356 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
357
358 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
359 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
360 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
361
362 execute_window_loop(window, [&](const Coordinates & id)
363 {
364 const float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
365 const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
366 const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
367 float32x2_t res = {};
368 if(pooling_type == PoolingType::AVG)
369 {
370 // Calculate scale
371 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
372 const float32x2_t scale_v = vdup_n_f32(scale);
373
374 // Perform pooling
375 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
376 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
377 res = vmul_f32(vpadd_f32(res, res), scale_v);
378 }
379 else
380 {
381 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
382 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
383 res = vpmax_f32(res, res);
384 }
385 *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
386 },
387 input, output);
388}
389
390void NEPoolingLayerKernel::run(const Window &window)
391{
392 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
393 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
394 ARM_COMPUTE_ERROR_ON(_func == nullptr);
395
396 unsigned int pool_stride_x, pool_stride_y = 0;
397 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
398
399 // Set step for input in x and y direction for the input
400 Window window_input(window);
401 unsigned int window_x_inc = 0;
402 if(_input->info()->data_type() == DataType::QS8)
403 {
404 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
405 }
406 else
407 {
408 window_x_inc = pool_stride_x;
409 }
410 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
411 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
412
413 // Run function
414 (this->*_func)(window_input, window);
415}