blob: b97564e77b2cc9407b9d8e104003822e62e89d0e [file] [log] [blame]
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
31#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010032#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033#include "arm_compute/core/TensorInfo.h"
34#include "arm_compute/core/Utils.h"
35#include "arm_compute/core/Validate.h"
36#include "arm_compute/core/Window.h"
37
38#include <algorithm>
39#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010040#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010042#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043#include <string>
44#include <tuple>
45
46using namespace arm_compute;
47
48namespace
49{
50inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
51 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
52{
Pablo Tello0c34fe22017-06-26 17:17:42 +010053 const int start_x = id.x() * stride_x - pad_x;
54 const int start_y = id.y() * stride_y - pad_y;
55 const int end_x = std::min(start_x + pool_size, upper_bound_w);
56 const int end_y = std::min(start_y + pool_size, upper_bound_h);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010057 return 1.f / ((end_y - start_y) * (end_x - start_x));
58}
59
60inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
61 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
62{
Pablo Tello0c34fe22017-06-26 17:17:42 +010063 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010064 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
65 const int start_x = id.x() * stride_x - pad_x;
66 const int start_y = id.y() * stride_y - pad_y;
67 const int end_x = std::min(start_x + pool_size, upper_bound_w);
68 const int end_y = std::min(start_y + pool_size, upper_bound_h);
69 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010070 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
71}
72
73inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
74 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
75{
76 static std::array<qint16_t, 10> scale_values_q16 =
77 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
78 const int start_x = id.x() * stride_x - pad_x;
79 const int start_y = id.y() * stride_y - pad_y;
80 const int end_x = std::min(start_x + pool_size, upper_bound_w);
81 const int end_y = std::min(start_y + pool_size, upper_bound_h);
82 const int val = ((end_y - start_y) * (end_x - start_x));
83 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010084}
85} // namespace
86
87NEPoolingLayerKernel::NEPoolingLayerKernel()
88 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
89{
90}
91
92BorderSize NEPoolingLayerKernel::border_size() const
93{
94 return _border_size;
95}
96
97void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
98{
Gian Marco Iodice4e288692017-06-27 11:41:59 +010099 int pool_pad_x = 0;
100 int pool_pad_y = 0;
101 int pool_stride_x = 0;
102 int pool_stride_y = 0;
103 unsigned int pooled_w = 0;
104 unsigned int pooled_h = 0;
105 PoolingType pool_type = pool_info.pool_type();
106 int pool_size = pool_info.pool_size();
107 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100108 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
109 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
110
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100111 static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
112 ARM_COMPUTE_UNUSED(supported_pool_sizes);
113
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100114 ARM_COMPUTE_ERROR_ON_NULLPTR(output);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100115 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100116 ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100117 ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
118 ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100119 ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100120 ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100121
122 // Check output dimensions
123 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
Gian Marco Iodice4e288692017-06-27 11:41:59 +0100124 pool_size, pool_size, pool_info.pad_stride_info());
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100125
126 // Output auto initialization if not yet initialized
127 {
128 TensorShape output_shape{ input->info()->tensor_shape() };
129 output_shape.set(0, pooled_w);
130 output_shape.set(1, pooled_h);
131
132 auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
133 }
134
135 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
136 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100137 ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
138
139 unsigned int num_elems_read_per_iteration = 0;
140 unsigned int num_elems_processed_per_iteration = 0;
141 unsigned int num_elems_horizontal_window = 0;
142
143 // Select element size
144 switch(input->info()->data_type())
145 {
146 case DataType::QS8:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100147 num_elems_read_per_iteration = 16;
148 switch(pool_size)
149 {
150 case 2:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100151 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100152 break;
153 case 3:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100154 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100155 break;
156 default:
157 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100158 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100159 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100160 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
161 break;
162 case DataType::QS16:
163 num_elems_read_per_iteration = 8;
164 switch(pool_size)
165 {
166 case 2:
167 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
168 break;
169 case 3:
170 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
171 break;
172 default:
173 ARM_COMPUTE_ERROR("Pooling size not supported");
174 }
175 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100176 break;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100177#ifdef ARM_COMPUTE_ENABLE_FP16
178 case DataType::F16:
179 switch(pool_size)
180 {
181 case 2:
182 num_elems_read_per_iteration = 16;
183 num_elems_processed_per_iteration = 8;
184 num_elems_horizontal_window = 8;
185 break;
186 case 3:
187 num_elems_read_per_iteration = 4;
188 num_elems_processed_per_iteration = 1;
189 num_elems_horizontal_window = 1;
190 break;
191 default:
192 ARM_COMPUTE_ERROR("Pooling size not supported");
193 break;
194 }
195 break;
196#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100197 case DataType::F32:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100198 switch(pool_size)
199 {
200 case 2:
201 num_elems_read_per_iteration = 2;
202 break;
203 case 3:
204 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
205 break;
206 case 7:
207 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
208 break;
209 default:
210 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100211 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100212 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100213 num_elems_processed_per_iteration = 1;
214 num_elems_horizontal_window = 1;
215 break;
216 default:
217 ARM_COMPUTE_ERROR("Element size not supported");
218 break;
219 }
220
221 _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
222 const int input_width = input->info()->dimension(0);
223 const int input_height = input->info()->dimension(1);
224 const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
225 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
226
227 // Set instance variables
228 _input = input;
229 _output = output;
230 _pool_info = pool_info;
231 _border_size = BorderSize(pool_pad_y, pool_pad_x);
232 _border_size.right = std::max(upper_bound_w, pool_pad_x);
233 _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
234
235 // Select appropriate function
236 switch(pool_size)
237 {
238 case 2:
239 if(input->info()->data_type() == DataType::QS8)
240 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100241 switch(pool_type)
242 {
243 case PoolingType::AVG:
244 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
245 break;
246 case PoolingType::MAX:
247 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
248 break;
249 default:
250 ARM_COMPUTE_ERROR("Unsupported pooling type!");
251 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100252 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100253 else if(input->info()->data_type() == DataType::QS16)
254 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100255 switch(pool_type)
256 {
257 case PoolingType::AVG:
258 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
259 break;
260 case PoolingType::MAX:
261 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
262 break;
263 default:
264 ARM_COMPUTE_ERROR("Unsupported pooling type!");
265 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100266 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100267 else if(input->info()->data_type() == DataType::F16)
268 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100269 switch(pool_type)
270 {
271 case PoolingType::AVG:
272 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG>;
273 break;
274 case PoolingType::L2:
275 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2>;
276 break;
277 case PoolingType::MAX:
278 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
279 break;
280 default:
281 ARM_COMPUTE_ERROR("Unsupported pooling type!");
282 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100283 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100284 else if(input->info()->data_type() == DataType::F32)
285 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100286 switch(pool_type)
287 {
288 case PoolingType::AVG:
289 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG>;
290 break;
291 case PoolingType::L2:
292 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2>;
293 break;
294 case PoolingType::MAX:
295 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
296 break;
297 default:
298 ARM_COMPUTE_ERROR("Unsupported pooling type!");
299 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100300 }
301 break;
302 case 3:
303 if(input->info()->data_type() == DataType::QS8)
304 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100305 switch(pool_type)
306 {
307 case PoolingType::AVG:
308 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
309 break;
310 case PoolingType::MAX:
311 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
312 break;
313 default:
314 ARM_COMPUTE_ERROR("Unsupported pooling type!");
315 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100316 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100317 else if(input->info()->data_type() == DataType::QS16)
318 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100319 switch(pool_type)
320 {
321 case PoolingType::AVG:
322 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
323 break;
324 case PoolingType::MAX:
325 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
326 break;
327 default:
328 ARM_COMPUTE_ERROR("Unsupported pooling type!");
329 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100330 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100331 else if(input->info()->data_type() == DataType::F16)
332 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100333 switch(pool_type)
334 {
335 case PoolingType::AVG:
336 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG>;
337 break;
338 case PoolingType::L2:
339 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2>;
340 break;
341 case PoolingType::MAX:
342 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
343 break;
344 default:
345 ARM_COMPUTE_ERROR("Unsupported pooling type!");
346 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100347 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100348 else if(input->info()->data_type() == DataType::F32)
349 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100350 switch(pool_type)
351 {
352 case PoolingType::AVG:
353 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG>;
354 break;
355 case PoolingType::L2:
356 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2>;
357 break;
358 case PoolingType::MAX:
359 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
360 break;
361 default:
362 ARM_COMPUTE_ERROR("Unsupported pooling type!");
363 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100364 }
365 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100366 case 7:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100367 switch(pool_type)
368 {
369 case PoolingType::AVG:
370 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG>;
371 break;
372 case PoolingType::L2:
373 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2>;
374 break;
375 case PoolingType::MAX:
376 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
377 break;
378 default:
379 ARM_COMPUTE_ERROR("Unsupported pooling type!");
380 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100381 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100382 default:
383 ARM_COMPUTE_ERROR("Unsupported pooling size");
384 break;
385 }
386
387 // Configure kernel window
388 Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
389 AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
390 AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
391 update_window_and_padding(win, input_access, output_access);
392 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
393 INEKernel::configure(win);
394}
395
396template <PoolingType pooling_type>
397void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
398{
399 Iterator input(_input, window_input);
400 Iterator output(_output, window);
401
402 const int fixed_point_position = _input->info()->fixed_point_position();
403 constexpr int pool_size = 2;
404 int pool_pad_x = 0;
405 int pool_pad_y = 0;
406 int pool_stride_x = 0;
407 int pool_stride_y = 0;
408 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
409 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
410 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
411 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
412
413 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
414 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
415
416 execute_window_loop(window, [&](const Coordinates & id)
417 {
418 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
419 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100420 qint8x8_t lower_res = {};
421 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100422 if(pooling_type == PoolingType::AVG)
423 {
424 // Calculate scale
425 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
426 const qint8x8_t scale_vec = vdup_n_qs8(scale);
427
428 // Perform pooling
429 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100430 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
431 if(pool_stride_x == 1)
432 {
433 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
434 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
435 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100436 }
437 else
438 {
439 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100440 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
441 if(pool_stride_x == 1)
442 {
443 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
444 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
445 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100446 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100447 if(pool_stride_x == 1)
448 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100449 const qint8x8x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100450 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
451 }
452 else
453 {
454 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
455 }
456 },
457 input, output);
458}
459
460template <PoolingType pooling_type>
461void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
462{
463 Iterator input(_input, window_input);
464 Iterator output(_output, window);
465
466 const int fixed_point_position = _input->info()->fixed_point_position();
467 constexpr int pool_size = 2;
468 int pool_pad_x = 0;
469 int pool_pad_y = 0;
470 int pool_stride_x = 0;
471 int pool_stride_y = 0;
472 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
473 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
474 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
475 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
476
477 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
478 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
479
480 execute_window_loop(window, [&](const Coordinates & id)
481 {
482 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
483 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
484 qint16x4_t lower_res = {};
485 qint16x4_t upper_res = {};
486 if(pooling_type == PoolingType::AVG)
487 {
488 // Calculate scale
489 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
490 const qint16x4_t scale_vec = vdup_n_qs16(scale);
491
492 // Perform pooling
493 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
494 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
495 if(pool_stride_x == 1)
496 {
497 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
498 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
499 }
500 }
501 else
502 {
503 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
504 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
505 if(pool_stride_x == 1)
506 {
507 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
508 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
509 }
510 }
511 if(pool_stride_x == 1)
512 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100513 const qint16x4x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100514 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
515 }
516 else
517 {
518 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
519 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100520 },
521 input, output);
522}
523
524template <PoolingType pooling_type>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100525void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
526{
527#ifdef ARM_COMPUTE_ENABLE_FP16
528 Iterator input(_input, window_input);
529 Iterator output(_output, window);
530
531 constexpr const int pool_size = 3;
532 int pool_pad_x = 0;
533 int pool_pad_y = 0;
534 int pool_stride_x = 0;
535 int pool_stride_y = 0;
536 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
537 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
538 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
539 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
540
541 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
542 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
543 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
544
545 execute_window_loop(window, [&](const Coordinates & id)
546 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100547 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
548 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
549 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
550 float16x4_t res = {};
551
552 // Get power of 2 in case of l2 pooling
553 if(pooling_type == PoolingType::L2)
554 {
555 top_data = vmul_f16(top_data, top_data);
556 middle_data = vmul_f16(middle_data, middle_data);
557 bottom_data = vmul_f16(bottom_data, bottom_data);
558 }
559
560 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100561 {
562 // Calculate scale
563 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
564 const float16x4_t scale_v = vdup_n_f16(scale);
565 // Perform pooling
566 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
567 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
568 res = vmul_f16(vpadd_f16(res, res), scale_v);
569 }
570 else
571 {
572 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
573 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
574 res = vpmax_f16(res, res);
575 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100576
577 // Calculate square-root in case of l2 pooling
578 if(pooling_type == PoolingType::L2)
579 {
580 res = vinv_f16(vinvsqrt_f16(res));
581 }
582
Pablo Tello0c34fe22017-06-26 17:17:42 +0100583 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
584 },
585 input, output);
586#else /* ARM_COMPUTE_ENABLE_FP16 */
587 ARM_COMPUTE_UNUSED(window_input);
588 ARM_COMPUTE_UNUSED(window);
589 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
590#endif /* ARM_COMPUTE_ENABLE_FP16 */
591}
592
593template <PoolingType pooling_type>
594void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
595{
596#ifdef ARM_COMPUTE_ENABLE_FP16
597 Iterator input(_input, window_input);
598 Iterator output(_output, window);
599 constexpr int pool_size = 2;
600 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
601 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
602 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
603 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
604 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
605
606 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
607 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
608
609 execute_window_loop(window, [&](const Coordinates & id)
610 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100611 auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
612 auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100613 float16x8_t res = {};
614
Georgios Pinitascdf51452017-08-31 14:21:36 +0100615 // Get power of 2 in case of l2 pooling
616 if(pooling_type == PoolingType::L2)
617 {
618 top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
619 top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
620 bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
621 bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
622 }
623
624 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100625 {
626 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
627 const float16x8_t scale_v = vdupq_n_f16(scale);
628 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
629 }
630 else
631 {
632 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
633 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100634
635 // Calculate square-root in case of l2 pooling
636 if(pooling_type == PoolingType::L2)
637 {
638 res = vinvq_f16(vinvsqrtq_f16(res));
639 }
640
641 // Store result
Pablo Tello0c34fe22017-06-26 17:17:42 +0100642 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
643 },
644 input, output);
645#else /* ARM_COMPUTE_ENABLE_FP16 */
646 ARM_COMPUTE_UNUSED(window_input);
647 ARM_COMPUTE_UNUSED(window);
648 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
649#endif /* ARM_COMPUTE_ENABLE_FP16 */
650}
651
652template <PoolingType pooling_type>
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100653void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
654{
655 Iterator input(_input, window_input);
656 Iterator output(_output, window);
657
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100658 constexpr int pool_size = 2;
659 int pool_pad_x = 0;
660 int pool_pad_y = 0;
661 int pool_stride_x = 0;
662 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100663 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
664 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
665 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
666 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
667
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100668 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
669 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100670
671 execute_window_loop(window, [&](const Coordinates & id)
672 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100673 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
674 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
675 float32x2_t res = {};
676 float final_res = 0;
677
678 // Get power of 2 in case of l2 pooling
679 if(pooling_type == PoolingType::L2)
680 {
681 top_data = vmul_f32(top_data, top_data);
682 bottom_data = vmul_f32(bottom_data, bottom_data);
683 }
684
685 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100686 {
687 // Calculate scale
688 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
689 const float32x2_t scale_v = vdup_n_f32(scale);
690
691 // Perform pooling
692 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
693 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
694 }
695 else
696 {
697 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
698 res = vpmax_f32(max_data, max_data);
699 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100700 final_res = vget_lane_f32(res, 0);
701
702 // Calculate square-root in case of l2 pooling
703 if(pooling_type == PoolingType::L2)
704 {
705 final_res = sqrt(final_res);
706 }
707
708 // Store result
709 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100710 },
711 input, output);
712}
713
714template <PoolingType pooling_type>
715void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
716{
717 Iterator input(_input, window_input);
718 Iterator output(_output, window);
719
720 const int fixed_point_position = _input->info()->fixed_point_position();
721 constexpr int pool_size = 3;
722 int pool_pad_x = 0;
723 int pool_pad_y = 0;
724 int pool_stride_x = 0;
725 int pool_stride_y = 0;
726 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
727 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
728 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
729 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
730
731 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
732 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
733 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
734
735 execute_window_loop(window, [&](const Coordinates & id)
736 {
737 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
738 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
739 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
740 qint8x8_t res = {};
741 if(pooling_type == PoolingType::AVG)
742 {
743 // Calculate scale
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100744 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100745
746 // Perform pooling for stride 2
747 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
748 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
749 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
750 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
751 if(pool_stride_x == 2)
752 {
753 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
754 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100755 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100756 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100757 res = vqmul_qs8(res, scale_vec, fixed_point_position);
758 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100759 }
760 else
761 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100762 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
763 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100764 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100765 }
766 else
767 {
768 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
769 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
770 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
771 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
772
773 if(pool_stride_x == 2)
774 {
775 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
776 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
777 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100778 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100779 }
780 else
781 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100782 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100783 }
784 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100785 },
786 input, output);
787}
788
789template <PoolingType pooling_type>
790void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
791{
792 Iterator input(_input, window_input);
793 Iterator output(_output, window);
794
795 const int fixed_point_position = _input->info()->fixed_point_position();
796 constexpr int pool_size = 3;
797 int pool_pad_x = 0;
798 int pool_pad_y = 0;
799 int pool_stride_x = 0;
800 int pool_stride_y = 0;
801 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
802 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
803 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
804 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
805
806 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
807 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
808 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
809
810 execute_window_loop(window, [&](const Coordinates & id)
811 {
812 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
813 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
814 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
815
816 if(pooling_type == PoolingType::AVG)
817 {
818 // Calculate scale
819 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
820
821 // Perform pooling for stride 2
822 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
823 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
824 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
825 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
826 if(pool_stride_x == 2)
827 {
828 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
829 const qint16x4_t scale_vec = vdup_n_qs16(scale);
830 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
831 }
832 else
833 {
834 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
835 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
836 }
837 }
838 else
839 {
840 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
841 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
842 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
843 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
844
845 if(pool_stride_x == 2)
846 {
847 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
848 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
849 }
850 else
851 {
852 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
853 }
854 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100855 },
856 input, output);
857}
858
859template <PoolingType pooling_type>
860void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
861{
862 Iterator input(_input, window_input);
863 Iterator output(_output, window);
864
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100865 constexpr const int pool_size = 3;
866 int pool_pad_x = 0;
867 int pool_pad_y = 0;
868 int pool_stride_x = 0;
869 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100870 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
871 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
872 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
873 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
874
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100875 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
876 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
877 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100878
879 execute_window_loop(window, [&](const Coordinates & id)
880 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100881 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
882 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
883 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
884 float32x2_t res = {};
885 float final_res = 0;
886
887 // Get power of 2 in case of l2 pooling
888 if(pooling_type == PoolingType::L2)
889 {
890 top_data = vmulq_f32(top_data, top_data);
891 middle_data = vmulq_f32(middle_data, middle_data);
892 bottom_data = vmulq_f32(bottom_data, bottom_data);
893 }
894
895 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100896 {
897 // Calculate scale
898 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
899 const float32x2_t scale_v = vdup_n_f32(scale);
900
901 // Perform pooling
902 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
903 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
904 res = vmul_f32(vpadd_f32(res, res), scale_v);
905 }
906 else
907 {
908 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
909 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
910 res = vpmax_f32(res, res);
911 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100912 final_res = vget_lane_f32(res, 0);
913
914 // Calculate square-root in case of l2 pooling
915 if(pooling_type == PoolingType::L2)
916 {
917 final_res = sqrt(final_res);
918 }
919
920 // Store result
921 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100922 },
923 input, output);
924}
925
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100926template <PoolingType pooling_type>
927void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
928{
929 Iterator input(_input, window_input);
930 Iterator output(_output, window);
931
932 constexpr const int pool_size = 7;
933 int pool_pad_x = 0;
934 int pool_pad_y = 0;
935 int pool_stride_x = 0;
936 int pool_stride_y = 0;
937 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
938 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
939 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
940 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
941
942 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
943 for(int i = 0; i < pool_size; ++i)
944 {
945 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
946 }
947
948 execute_window_loop(window, [&](const Coordinates & id)
949 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100950 float32x2_t res = {};
951 float final_res = 0.f;
952 if(pooling_type != PoolingType::MAX)
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100953 {
954 // Calculate scale
955 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
956 const float32x2_t scale_v = vdup_n_f32(scale);
957
958 // Perform pooling
Georgios Pinitascdf51452017-08-31 14:21:36 +0100959 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
960 // Get power of 2 in case of l2 pooling
961 if(pooling_type == PoolingType::L2)
962 {
963 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
964 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
965 }
966 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100967 for(int i = 1; i < pool_size; ++i)
968 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100969 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
970 // Get power of 2 in case of l2 pooling
971 if(pooling_type == PoolingType::L2)
972 {
973 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
974 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
975 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100976 sum_data = vaddq_f32(sum_data, data.val[0]);
977 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
978 }
979 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
980 res = vmul_f32(vpadd_f32(res, res), scale_v);
981 }
982 else
983 {
984 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
985 for(int i = 1; i < pool_size; ++i)
986 {
987 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
988 max_data = vmax2q_f32(max_data, data);
989 }
990 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
991 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
992 res = vpmax_f32(res, res);
993 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100994 final_res = vget_lane_f32(res, 0);
995
996 // Calculate square-root in case of l2 pooling
997 if(pooling_type == PoolingType::L2)
998 {
999 final_res = sqrt(final_res);
1000 }
1001
1002 // Store result
1003 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001004 },
1005 input, output);
1006}
1007
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001008void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001009{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001010 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001011 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1012 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1013 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1014
Pablo Tello0c34fe22017-06-26 17:17:42 +01001015 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
1016 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001017
1018 // Set step for input in x and y direction for the input
1019 Window window_input(window);
1020 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +01001021 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001022 {
Pablo Tello0c34fe22017-06-26 17:17:42 +01001023 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001024 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +01001025 case DataType::F16:
1026 {
1027 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1028 break;
1029 }
1030 case DataType::F32:
1031 {
1032 window_x_inc = pool_stride_x;
1033 break;
1034 }
1035 default:
1036 {
1037 ARM_COMPUTE_ERROR("Not supported");
1038 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001039 }
1040 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
1041 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
1042
1043 // Run function
1044 (this->*_func)(window_input, window);
1045}