blob: 8d4e46500f55127a1f16c2f2eba764c171a3b301 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
31#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010032#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033#include "arm_compute/core/TensorInfo.h"
34#include "arm_compute/core/Utils.h"
35#include "arm_compute/core/Validate.h"
36#include "arm_compute/core/Window.h"
37
38#include <algorithm>
39#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010040#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010042#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043#include <string>
44#include <tuple>
45
46using namespace arm_compute;
47
48namespace
49{
50inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
51 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
52{
Pablo Tello0c34fe22017-06-26 17:17:42 +010053 const int start_x = id.x() * stride_x - pad_x;
54 const int start_y = id.y() * stride_y - pad_y;
55 const int end_x = std::min(start_x + pool_size, upper_bound_w);
56 const int end_y = std::min(start_y + pool_size, upper_bound_h);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010057 return 1.f / ((end_y - start_y) * (end_x - start_x));
58}
59
60inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
61 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
62{
Pablo Tello0c34fe22017-06-26 17:17:42 +010063 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010064 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
65 const int start_x = id.x() * stride_x - pad_x;
66 const int start_y = id.y() * stride_y - pad_y;
67 const int end_x = std::min(start_x + pool_size, upper_bound_w);
68 const int end_y = std::min(start_y + pool_size, upper_bound_h);
69 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010070 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
71}
72
73inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
74 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
75{
76 static std::array<qint16_t, 10> scale_values_q16 =
77 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
78 const int start_x = id.x() * stride_x - pad_x;
79 const int start_y = id.y() * stride_y - pad_y;
80 const int end_x = std::min(start_x + pool_size, upper_bound_w);
81 const int end_y = std::min(start_y + pool_size, upper_bound_h);
82 const int val = ((end_y - start_y) * (end_x - start_x));
83 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010084}
85} // namespace
86
87NEPoolingLayerKernel::NEPoolingLayerKernel()
88 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
89{
90}
91
92BorderSize NEPoolingLayerKernel::border_size() const
93{
94 return _border_size;
95}
96
97void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
98{
Gian Marco Iodice4e288692017-06-27 11:41:59 +010099 int pool_pad_x = 0;
100 int pool_pad_y = 0;
101 int pool_stride_x = 0;
102 int pool_stride_y = 0;
103 unsigned int pooled_w = 0;
104 unsigned int pooled_h = 0;
105 PoolingType pool_type = pool_info.pool_type();
106 int pool_size = pool_info.pool_size();
107 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100108 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
109 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
110
Gian Marco Iodice16824302017-09-28 15:41:37 +0100111 static const std::set<int> supported_pool_sizes = { 2, 3 };
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100112 ARM_COMPUTE_UNUSED(supported_pool_sizes);
113
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100114 ARM_COMPUTE_ERROR_ON_NULLPTR(output);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100115 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100116 ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
Gian Marco Iodice16824302017-09-28 15:41:37 +0100117 ARM_COMPUTE_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->info()->data_type() != DataType::F32));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100118 ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100119 ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100120
121 // Check output dimensions
122 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
Gian Marco Iodice4e288692017-06-27 11:41:59 +0100123 pool_size, pool_size, pool_info.pad_stride_info());
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100124
125 // Output auto initialization if not yet initialized
126 {
127 TensorShape output_shape{ input->info()->tensor_shape() };
128 output_shape.set(0, pooled_w);
129 output_shape.set(1, pooled_h);
130
131 auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
132 }
133
134 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
135 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100136 ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
137
138 unsigned int num_elems_read_per_iteration = 0;
139 unsigned int num_elems_processed_per_iteration = 0;
140 unsigned int num_elems_horizontal_window = 0;
141
142 // Select element size
143 switch(input->info()->data_type())
144 {
145 case DataType::QS8:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100146 num_elems_read_per_iteration = 16;
147 switch(pool_size)
148 {
149 case 2:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100150 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100151 break;
152 case 3:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100153 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100154 break;
155 default:
156 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100157 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100158 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100159 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
160 break;
161 case DataType::QS16:
162 num_elems_read_per_iteration = 8;
163 switch(pool_size)
164 {
165 case 2:
166 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
167 break;
168 case 3:
169 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
170 break;
171 default:
172 ARM_COMPUTE_ERROR("Pooling size not supported");
173 }
174 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100175 break;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100176#ifdef ARM_COMPUTE_ENABLE_FP16
177 case DataType::F16:
178 switch(pool_size)
179 {
180 case 2:
181 num_elems_read_per_iteration = 16;
182 num_elems_processed_per_iteration = 8;
183 num_elems_horizontal_window = 8;
184 break;
185 case 3:
186 num_elems_read_per_iteration = 4;
187 num_elems_processed_per_iteration = 1;
188 num_elems_horizontal_window = 1;
189 break;
190 default:
191 ARM_COMPUTE_ERROR("Pooling size not supported");
192 break;
193 }
194 break;
195#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100196 case DataType::F32:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100197 switch(pool_size)
198 {
199 case 2:
200 num_elems_read_per_iteration = 2;
201 break;
202 case 3:
203 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
204 break;
205 case 7:
206 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
207 break;
208 default:
Gian Marco Iodice16824302017-09-28 15:41:37 +0100209 num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop
Pablo Tello0c34fe22017-06-26 17:17:42 +0100210 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100211 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100212 num_elems_processed_per_iteration = 1;
213 num_elems_horizontal_window = 1;
214 break;
215 default:
216 ARM_COMPUTE_ERROR("Element size not supported");
217 break;
218 }
219
220 _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
221 const int input_width = input->info()->dimension(0);
222 const int input_height = input->info()->dimension(1);
223 const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
224 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
225
226 // Set instance variables
227 _input = input;
228 _output = output;
229 _pool_info = pool_info;
230 _border_size = BorderSize(pool_pad_y, pool_pad_x);
231 _border_size.right = std::max(upper_bound_w, pool_pad_x);
232 _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
233
234 // Select appropriate function
235 switch(pool_size)
236 {
237 case 2:
238 if(input->info()->data_type() == DataType::QS8)
239 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100240 switch(pool_type)
241 {
242 case PoolingType::AVG:
243 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
244 break;
245 case PoolingType::MAX:
246 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
247 break;
248 default:
249 ARM_COMPUTE_ERROR("Unsupported pooling type!");
250 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100251 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100252 else if(input->info()->data_type() == DataType::QS16)
253 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100254 switch(pool_type)
255 {
256 case PoolingType::AVG:
257 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
258 break;
259 case PoolingType::MAX:
260 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
261 break;
262 default:
263 ARM_COMPUTE_ERROR("Unsupported pooling type!");
264 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100265 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100266 else if(input->info()->data_type() == DataType::F16)
267 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100268 switch(pool_type)
269 {
270 case PoolingType::AVG:
271 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG>;
272 break;
273 case PoolingType::L2:
274 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2>;
275 break;
276 case PoolingType::MAX:
277 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
278 break;
279 default:
280 ARM_COMPUTE_ERROR("Unsupported pooling type!");
281 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100282 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100283 else if(input->info()->data_type() == DataType::F32)
284 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100285 switch(pool_type)
286 {
287 case PoolingType::AVG:
288 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG>;
289 break;
290 case PoolingType::L2:
291 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2>;
292 break;
293 case PoolingType::MAX:
294 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
295 break;
296 default:
297 ARM_COMPUTE_ERROR("Unsupported pooling type!");
298 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100299 }
300 break;
301 case 3:
302 if(input->info()->data_type() == DataType::QS8)
303 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100304 switch(pool_type)
305 {
306 case PoolingType::AVG:
307 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
308 break;
309 case PoolingType::MAX:
310 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
311 break;
312 default:
313 ARM_COMPUTE_ERROR("Unsupported pooling type!");
314 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100315 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100316 else if(input->info()->data_type() == DataType::QS16)
317 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100318 switch(pool_type)
319 {
320 case PoolingType::AVG:
321 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
322 break;
323 case PoolingType::MAX:
324 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
325 break;
326 default:
327 ARM_COMPUTE_ERROR("Unsupported pooling type!");
328 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100329 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100330 else if(input->info()->data_type() == DataType::F16)
331 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100332 switch(pool_type)
333 {
334 case PoolingType::AVG:
335 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG>;
336 break;
337 case PoolingType::L2:
338 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2>;
339 break;
340 case PoolingType::MAX:
341 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
342 break;
343 default:
344 ARM_COMPUTE_ERROR("Unsupported pooling type!");
345 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100346 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100347 else if(input->info()->data_type() == DataType::F32)
348 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100349 switch(pool_type)
350 {
351 case PoolingType::AVG:
352 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG>;
353 break;
354 case PoolingType::L2:
355 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2>;
356 break;
357 case PoolingType::MAX:
358 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
359 break;
360 default:
361 ARM_COMPUTE_ERROR("Unsupported pooling type!");
362 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100363 }
364 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100365 case 7:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100366 switch(pool_type)
367 {
368 case PoolingType::AVG:
369 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG>;
370 break;
371 case PoolingType::L2:
372 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2>;
373 break;
374 case PoolingType::MAX:
375 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
376 break;
377 default:
378 ARM_COMPUTE_ERROR("Unsupported pooling type!");
379 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100380 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100381 default:
Gian Marco Iodice16824302017-09-28 15:41:37 +0100382 switch(pool_type)
383 {
384 case PoolingType::AVG:
385 _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG>;
386 break;
387 case PoolingType::L2:
388 _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2>;
389 break;
390 case PoolingType::MAX:
391 _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX>;
392 break;
393 default:
394 ARM_COMPUTE_ERROR("Unsupported pooling type!");
395 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100396 break;
397 }
398
399 // Configure kernel window
400 Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
401 AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
402 AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
403 update_window_and_padding(win, input_access, output_access);
404 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
405 INEKernel::configure(win);
406}
407
408template <PoolingType pooling_type>
409void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
410{
411 Iterator input(_input, window_input);
412 Iterator output(_output, window);
413
414 const int fixed_point_position = _input->info()->fixed_point_position();
415 constexpr int pool_size = 2;
416 int pool_pad_x = 0;
417 int pool_pad_y = 0;
418 int pool_stride_x = 0;
419 int pool_stride_y = 0;
420 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
421 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
422 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
423 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
424
425 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
426 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
427
428 execute_window_loop(window, [&](const Coordinates & id)
429 {
430 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
431 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100432 qint8x8_t lower_res = {};
433 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100434 if(pooling_type == PoolingType::AVG)
435 {
436 // Calculate scale
437 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
438 const qint8x8_t scale_vec = vdup_n_qs8(scale);
439
440 // Perform pooling
441 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100442 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
443 if(pool_stride_x == 1)
444 {
445 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
446 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
447 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100448 }
449 else
450 {
451 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100452 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
453 if(pool_stride_x == 1)
454 {
455 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
456 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
457 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100458 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100459 if(pool_stride_x == 1)
460 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100461 const qint8x8x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100462 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
463 }
464 else
465 {
466 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
467 }
468 },
469 input, output);
470}
471
472template <PoolingType pooling_type>
473void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
474{
475 Iterator input(_input, window_input);
476 Iterator output(_output, window);
477
478 const int fixed_point_position = _input->info()->fixed_point_position();
479 constexpr int pool_size = 2;
480 int pool_pad_x = 0;
481 int pool_pad_y = 0;
482 int pool_stride_x = 0;
483 int pool_stride_y = 0;
484 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
485 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
486 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
487 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
488
489 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
490 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
491
492 execute_window_loop(window, [&](const Coordinates & id)
493 {
494 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
495 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
496 qint16x4_t lower_res = {};
497 qint16x4_t upper_res = {};
498 if(pooling_type == PoolingType::AVG)
499 {
500 // Calculate scale
501 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
502 const qint16x4_t scale_vec = vdup_n_qs16(scale);
503
504 // Perform pooling
505 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
506 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
507 if(pool_stride_x == 1)
508 {
509 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
510 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
511 }
512 }
513 else
514 {
515 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
516 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
517 if(pool_stride_x == 1)
518 {
519 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
520 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
521 }
522 }
523 if(pool_stride_x == 1)
524 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100525 const qint16x4x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100526 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
527 }
528 else
529 {
530 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
531 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100532 },
533 input, output);
534}
535
536template <PoolingType pooling_type>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100537void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
538{
539#ifdef ARM_COMPUTE_ENABLE_FP16
540 Iterator input(_input, window_input);
541 Iterator output(_output, window);
542
543 constexpr const int pool_size = 3;
544 int pool_pad_x = 0;
545 int pool_pad_y = 0;
546 int pool_stride_x = 0;
547 int pool_stride_y = 0;
548 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
549 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
550 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
551 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
552
553 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
554 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
555 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
556
557 execute_window_loop(window, [&](const Coordinates & id)
558 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100559 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
560 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
561 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
562 float16x4_t res = {};
563
564 // Get power of 2 in case of l2 pooling
565 if(pooling_type == PoolingType::L2)
566 {
567 top_data = vmul_f16(top_data, top_data);
568 middle_data = vmul_f16(middle_data, middle_data);
569 bottom_data = vmul_f16(bottom_data, bottom_data);
570 }
571
572 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100573 {
574 // Calculate scale
575 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
576 const float16x4_t scale_v = vdup_n_f16(scale);
577 // Perform pooling
578 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
579 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
580 res = vmul_f16(vpadd_f16(res, res), scale_v);
581 }
582 else
583 {
584 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
585 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
586 res = vpmax_f16(res, res);
587 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100588
589 // Calculate square-root in case of l2 pooling
590 if(pooling_type == PoolingType::L2)
591 {
592 res = vinv_f16(vinvsqrt_f16(res));
593 }
594
Pablo Tello0c34fe22017-06-26 17:17:42 +0100595 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
596 },
597 input, output);
598#else /* ARM_COMPUTE_ENABLE_FP16 */
599 ARM_COMPUTE_UNUSED(window_input);
600 ARM_COMPUTE_UNUSED(window);
601 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
602#endif /* ARM_COMPUTE_ENABLE_FP16 */
603}
604
605template <PoolingType pooling_type>
606void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
607{
608#ifdef ARM_COMPUTE_ENABLE_FP16
609 Iterator input(_input, window_input);
610 Iterator output(_output, window);
611 constexpr int pool_size = 2;
612 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
613 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
614 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
615 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
616 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
617
618 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
619 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
620
621 execute_window_loop(window, [&](const Coordinates & id)
622 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100623 auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
624 auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100625 float16x8_t res = {};
626
Georgios Pinitascdf51452017-08-31 14:21:36 +0100627 // Get power of 2 in case of l2 pooling
628 if(pooling_type == PoolingType::L2)
629 {
630 top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
631 top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
632 bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
633 bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
634 }
635
636 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100637 {
638 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
639 const float16x8_t scale_v = vdupq_n_f16(scale);
640 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
641 }
642 else
643 {
644 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
645 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100646
647 // Calculate square-root in case of l2 pooling
648 if(pooling_type == PoolingType::L2)
649 {
650 res = vinvq_f16(vinvsqrtq_f16(res));
651 }
652
653 // Store result
Pablo Tello0c34fe22017-06-26 17:17:42 +0100654 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
655 },
656 input, output);
657#else /* ARM_COMPUTE_ENABLE_FP16 */
658 ARM_COMPUTE_UNUSED(window_input);
659 ARM_COMPUTE_UNUSED(window);
660 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
661#endif /* ARM_COMPUTE_ENABLE_FP16 */
662}
663
664template <PoolingType pooling_type>
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100665void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
666{
667 Iterator input(_input, window_input);
668 Iterator output(_output, window);
669
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100670 constexpr int pool_size = 2;
671 int pool_pad_x = 0;
672 int pool_pad_y = 0;
673 int pool_stride_x = 0;
674 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100675 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
676 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
677 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
678 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
679
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100680 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
681 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100682
683 execute_window_loop(window, [&](const Coordinates & id)
684 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100685 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
686 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
687 float32x2_t res = {};
688 float final_res = 0;
689
690 // Get power of 2 in case of l2 pooling
691 if(pooling_type == PoolingType::L2)
692 {
693 top_data = vmul_f32(top_data, top_data);
694 bottom_data = vmul_f32(bottom_data, bottom_data);
695 }
696
697 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100698 {
699 // Calculate scale
700 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
701 const float32x2_t scale_v = vdup_n_f32(scale);
702
703 // Perform pooling
704 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
705 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
706 }
707 else
708 {
709 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
710 res = vpmax_f32(max_data, max_data);
711 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100712 final_res = vget_lane_f32(res, 0);
713
714 // Calculate square-root in case of l2 pooling
715 if(pooling_type == PoolingType::L2)
716 {
717 final_res = sqrt(final_res);
718 }
719
720 // Store result
721 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100722 },
723 input, output);
724}
725
726template <PoolingType pooling_type>
727void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
728{
729 Iterator input(_input, window_input);
730 Iterator output(_output, window);
731
732 const int fixed_point_position = _input->info()->fixed_point_position();
733 constexpr int pool_size = 3;
734 int pool_pad_x = 0;
735 int pool_pad_y = 0;
736 int pool_stride_x = 0;
737 int pool_stride_y = 0;
738 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
739 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
740 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
741 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
742
743 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
744 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
745 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
746
747 execute_window_loop(window, [&](const Coordinates & id)
748 {
749 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
750 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
751 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
752 qint8x8_t res = {};
753 if(pooling_type == PoolingType::AVG)
754 {
755 // Calculate scale
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100756 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100757
758 // Perform pooling for stride 2
759 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
760 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
761 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
762 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
763 if(pool_stride_x == 2)
764 {
765 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
766 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100767 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100768 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100769 res = vqmul_qs8(res, scale_vec, fixed_point_position);
770 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100771 }
772 else
773 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100774 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
775 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100776 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100777 }
778 else
779 {
780 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
781 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
782 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
783 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
784
785 if(pool_stride_x == 2)
786 {
787 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
788 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
789 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100790 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100791 }
792 else
793 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100794 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100795 }
796 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100797 },
798 input, output);
799}
800
801template <PoolingType pooling_type>
802void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
803{
804 Iterator input(_input, window_input);
805 Iterator output(_output, window);
806
807 const int fixed_point_position = _input->info()->fixed_point_position();
808 constexpr int pool_size = 3;
809 int pool_pad_x = 0;
810 int pool_pad_y = 0;
811 int pool_stride_x = 0;
812 int pool_stride_y = 0;
813 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
814 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
815 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
816 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
817
818 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
819 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
820 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
821
822 execute_window_loop(window, [&](const Coordinates & id)
823 {
824 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
825 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
826 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
827
828 if(pooling_type == PoolingType::AVG)
829 {
830 // Calculate scale
831 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
832
833 // Perform pooling for stride 2
834 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
835 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
836 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
837 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
838 if(pool_stride_x == 2)
839 {
840 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
841 const qint16x4_t scale_vec = vdup_n_qs16(scale);
842 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
843 }
844 else
845 {
846 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
847 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
848 }
849 }
850 else
851 {
852 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
853 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
854 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
855 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
856
857 if(pool_stride_x == 2)
858 {
859 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
860 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
861 }
862 else
863 {
864 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
865 }
866 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100867 },
868 input, output);
869}
870
871template <PoolingType pooling_type>
872void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
873{
874 Iterator input(_input, window_input);
875 Iterator output(_output, window);
876
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100877 constexpr const int pool_size = 3;
878 int pool_pad_x = 0;
879 int pool_pad_y = 0;
880 int pool_stride_x = 0;
881 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100882 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
883 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
884 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
885 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
886
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100887 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
888 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
889 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100890
891 execute_window_loop(window, [&](const Coordinates & id)
892 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100893 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
894 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
895 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
896 float32x2_t res = {};
897 float final_res = 0;
898
899 // Get power of 2 in case of l2 pooling
900 if(pooling_type == PoolingType::L2)
901 {
902 top_data = vmulq_f32(top_data, top_data);
903 middle_data = vmulq_f32(middle_data, middle_data);
904 bottom_data = vmulq_f32(bottom_data, bottom_data);
905 }
906
907 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100908 {
909 // Calculate scale
910 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
911 const float32x2_t scale_v = vdup_n_f32(scale);
912
913 // Perform pooling
914 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
915 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
916 res = vmul_f32(vpadd_f32(res, res), scale_v);
917 }
918 else
919 {
920 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
921 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
922 res = vpmax_f32(res, res);
923 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100924 final_res = vget_lane_f32(res, 0);
925
926 // Calculate square-root in case of l2 pooling
927 if(pooling_type == PoolingType::L2)
928 {
929 final_res = sqrt(final_res);
930 }
931
932 // Store result
933 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100934 },
935 input, output);
936}
937
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100938template <PoolingType pooling_type>
939void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
940{
941 Iterator input(_input, window_input);
942 Iterator output(_output, window);
943
944 constexpr const int pool_size = 7;
945 int pool_pad_x = 0;
946 int pool_pad_y = 0;
947 int pool_stride_x = 0;
948 int pool_stride_y = 0;
949 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
950 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
951 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
952 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
953
954 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
955 for(int i = 0; i < pool_size; ++i)
956 {
957 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
958 }
959
960 execute_window_loop(window, [&](const Coordinates & id)
961 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100962 float32x2_t res = {};
963 float final_res = 0.f;
964 if(pooling_type != PoolingType::MAX)
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100965 {
966 // Calculate scale
967 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
968 const float32x2_t scale_v = vdup_n_f32(scale);
969
970 // Perform pooling
Georgios Pinitascdf51452017-08-31 14:21:36 +0100971 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
972 // Get power of 2 in case of l2 pooling
973 if(pooling_type == PoolingType::L2)
974 {
975 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
976 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
977 }
978 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100979 for(int i = 1; i < pool_size; ++i)
980 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100981 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
982 // Get power of 2 in case of l2 pooling
983 if(pooling_type == PoolingType::L2)
984 {
985 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
986 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
987 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100988 sum_data = vaddq_f32(sum_data, data.val[0]);
989 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
990 }
991 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
992 res = vmul_f32(vpadd_f32(res, res), scale_v);
993 }
994 else
995 {
996 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
997 for(int i = 1; i < pool_size; ++i)
998 {
999 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1000 max_data = vmax2q_f32(max_data, data);
1001 }
1002 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1003 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1004 res = vpmax_f32(res, res);
1005 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001006 final_res = vget_lane_f32(res, 0);
1007
1008 // Calculate square-root in case of l2 pooling
1009 if(pooling_type == PoolingType::L2)
1010 {
1011 final_res = sqrt(final_res);
1012 }
1013
1014 // Store result
1015 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001016 },
1017 input, output);
1018}
1019
Gian Marco Iodice16824302017-09-28 15:41:37 +01001020template <PoolingType pooling_type>
1021void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
1022{
1023 Iterator input(_input, window_input);
1024 Iterator output(_output, window);
1025
1026 const int pool_size = _pool_info.pool_size();
1027 int pool_pad_x = 0;
1028 int pool_pad_y = 0;
1029 int pool_stride_x = 0;
1030 int pool_stride_y = 0;
1031 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1032 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1033 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
1034 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
1035
1036 execute_window_loop(window, [&](const Coordinates & id)
1037 {
1038 float res = 0.0f;
1039
1040 if(pooling_type != PoolingType::MAX)
1041 {
1042 // Calculate scale
1043 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1044
1045 // Perform pooling
1046 float32x4_t vres = vdupq_n_f32(0.0f);
1047
1048 for(int y = 0; y < pool_size; ++y)
1049 {
1050 int x = 0;
1051 for(; x <= (pool_size - 4); x += 4)
1052 {
1053 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
1054 (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1055
1056 // Get power of 2 in case of l2 pooling and accumulate
1057 if(pooling_type == PoolingType::L2)
1058 {
1059 vres = vmlaq_f32(vres, data, data);
1060 }
1061 else
1062 {
1063 vres = vaddq_f32(vres, data);
1064 }
1065 }
1066
1067 // Leftover for loop
1068 for(; x < pool_size; ++x)
1069 {
1070 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1071
1072 // Get power of 2 in case of l2 pooling
1073 if(pooling_type == PoolingType::L2)
1074 {
1075 data *= data;
1076 }
1077
1078 res += data;
1079 }
1080 }
1081
1082#if defined(__aarch64__)
1083 // Reduction operation available on 64 bit architectures only
1084 res += vaddvq_f32(vres);
1085#else // __aarch64__
1086 // Reduction
1087 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1088 tmp = vpadd_f32(tmp, tmp);
1089
1090 res += vget_lane_f32(tmp, 0);
1091#endif // __aarch64__
1092 // Divide by scale
1093 res *= scale;
1094 }
1095 else
1096 {
1097 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::min());
1098 res = std::numeric_limits<float>::min();
1099
1100 for(int y = 0; y < pool_size; ++y)
1101 {
1102 int x = 0;
1103 for(; x <= (pool_size - 4); x += 4)
1104 {
1105 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
1106 (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1107 vres = vmaxq_f32(vres, data);
1108 }
1109
1110 // Leftover for loop
1111 for(; x < pool_size; ++x)
1112 {
1113 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1114 res = std::max(res, data);
1115 }
1116 }
1117
1118#if defined(__aarch64__)
1119 // Reduction operation available on 64 bit architectures only
1120 res = std::max(vmaxvq_f32(vres), res);
1121#else // __aarch64__
1122 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1123 tmp = vpmax_f32(tmp, tmp);
1124
1125 res = std::max(res, vget_lane_f32(tmp, 0));
1126#endif // __aarch64__
1127 }
1128
1129 // Calculate square-root in case of l2 pooling
1130 if(pooling_type == PoolingType::L2)
1131 {
1132 res = std::sqrt(res);
1133 }
1134
1135 // Store result
1136 *(reinterpret_cast<float *>(output.ptr())) = res;
1137 },
1138 input, output);
1139}
1140
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001141void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001142{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001143 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001144 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1145 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1146 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1147
Pablo Tello0c34fe22017-06-26 17:17:42 +01001148 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
1149 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001150
1151 // Set step for input in x and y direction for the input
1152 Window window_input(window);
1153 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +01001154 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001155 {
Pablo Tello0c34fe22017-06-26 17:17:42 +01001156 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001157 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +01001158 case DataType::F16:
1159 {
1160 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1161 break;
1162 }
1163 case DataType::F32:
1164 {
1165 window_x_inc = pool_stride_x;
1166 break;
1167 }
1168 default:
1169 {
1170 ARM_COMPUTE_ERROR("Not supported");
1171 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001172 }
1173 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
1174 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
1175
1176 // Run function
1177 (this->*_func)(window_input, window);
1178}