blob: 47372c2d5d928da5fd898cfbd2cfc9998982e0b2 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
31#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010032#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033#include "arm_compute/core/TensorInfo.h"
34#include "arm_compute/core/Utils.h"
35#include "arm_compute/core/Validate.h"
36#include "arm_compute/core/Window.h"
37
38#include <algorithm>
39#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010040#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010042#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043#include <string>
44#include <tuple>
45
46using namespace arm_compute;
47
48namespace
49{
Michalis Spyrouafa5d812017-11-30 14:25:57 +000050void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
51{
52 TensorShape output_shape{ input->tensor_shape() };
53 output_shape.set(0, pooled_w);
54 output_shape.set(1, pooled_h);
55
56 auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
57}
58
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000059template <bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010060inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
61 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
62{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000063 int start_x = id.x() * stride_x - pad_x;
64 int start_y = id.y() * stride_y - pad_y;
Pablo Tello0c34fe22017-06-26 17:17:42 +010065 const int end_x = std::min(start_x + pool_size, upper_bound_w);
66 const int end_y = std::min(start_y + pool_size, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000067 if(exclude_padding)
68 {
69 start_x = std::max(0, start_x);
70 start_y = std::max(0, start_y);
71 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010072 return 1.f / ((end_y - start_y) * (end_x - start_x));
73}
74
75inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
76 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
77{
Pablo Tello0c34fe22017-06-26 17:17:42 +010078 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010079 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
80 const int start_x = id.x() * stride_x - pad_x;
81 const int start_y = id.y() * stride_y - pad_y;
82 const int end_x = std::min(start_x + pool_size, upper_bound_w);
83 const int end_y = std::min(start_y + pool_size, upper_bound_h);
84 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010085 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
86}
87
88inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
89 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
90{
91 static std::array<qint16_t, 10> scale_values_q16 =
92 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
93 const int start_x = id.x() * stride_x - pad_x;
94 const int start_y = id.y() * stride_y - pad_y;
95 const int end_x = std::min(start_x + pool_size, upper_bound_w);
96 const int end_y = std::min(start_y + pool_size, upper_bound_h);
97 const int val = ((end_y - start_y) * (end_x - start_x));
98 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010099}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100100
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000101Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100102{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000103 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100104
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000105 int pool_pad_x = 0;
106 int pool_pad_y = 0;
107 int pool_stride_x = 0;
108 int pool_stride_y = 0;
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000109 PoolingType pool_type = pool_info.pool_type();
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000110 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
111 const bool exclude_padding = pool_info.exclude_padding();
112 const bool is_global_pooling = pool_info.is_global_pooling();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100113 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
114 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Gian Marco Iodice16824302017-09-28 15:41:37 +0100115 static const std::set<int> supported_pool_sizes = { 2, 3 };
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100116
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000117 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
118 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->data_type()));
119 ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->data_type() != DataType::F32));
120 ARM_COMPUTE_RETURN_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size));
121 ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()));
122 ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
123 ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100124
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000125 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100126 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000127 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
128 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
129 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h));
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100130 }
131
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000132 return Status{};
133}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000135Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size)
136{
137 const bool is_global_pooling = pool_info.is_global_pooling();
138 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
139 "Global pooling is supported only with rectangular inputs!");
140 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
141 "Invalid pool size and pool pad combination!");
142
143 return Status{};
144}
145
146std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
147 BorderSize &border_size,
148 unsigned int pooled_w, unsigned int pooled_h, int pool_size)
149{
150 unsigned int num_elems_read_per_iteration = 0;
151 unsigned int num_elems_horizontal_window = 0;
152 int pool_pad_x = 0;
153 int pool_pad_y = 0;
154 int pool_stride_x = 0;
155 int pool_stride_y = 0;
156 const int input_width = input->dimension(0);
157 const int input_height = input->dimension(1);
158 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
159 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
160 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
161
162 // Check output dimensions
163 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
164 input->dimension(1),
165 pool_size,
166 pool_size,
167 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100168
169 // Select element size
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000170 switch(input->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100171 {
172 case DataType::QS8:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100173 num_elems_read_per_iteration = 16;
174 switch(pool_size)
175 {
176 case 2:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100177 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100178 break;
179 case 3:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100180 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100181 break;
182 default:
183 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100184 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100185 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100186 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
187 break;
188 case DataType::QS16:
189 num_elems_read_per_iteration = 8;
190 switch(pool_size)
191 {
192 case 2:
193 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
194 break;
195 case 3:
196 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
197 break;
198 default:
199 ARM_COMPUTE_ERROR("Pooling size not supported");
200 }
201 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100202 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000203#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100204 case DataType::F16:
205 switch(pool_size)
206 {
207 case 2:
208 num_elems_read_per_iteration = 16;
209 num_elems_processed_per_iteration = 8;
210 num_elems_horizontal_window = 8;
211 break;
212 case 3:
213 num_elems_read_per_iteration = 4;
214 num_elems_processed_per_iteration = 1;
215 num_elems_horizontal_window = 1;
216 break;
217 default:
218 ARM_COMPUTE_ERROR("Pooling size not supported");
219 break;
220 }
221 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000222#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100223 case DataType::F32:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100224 switch(pool_size)
225 {
226 case 2:
227 num_elems_read_per_iteration = 2;
228 break;
229 case 3:
230 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
231 break;
232 case 7:
233 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
234 break;
235 default:
Gian Marco Iodice16824302017-09-28 15:41:37 +0100236 num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop
Pablo Tello0c34fe22017-06-26 17:17:42 +0100237 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100238 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100239 num_elems_processed_per_iteration = 1;
240 num_elems_horizontal_window = 1;
241 break;
242 default:
243 ARM_COMPUTE_ERROR("Element size not supported");
244 break;
245 }
246
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000247 const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
248 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
249
250 border_size = BorderSize(pool_pad_y, pool_pad_x);
251 border_size.right = std::max(upper_bound_w, pool_pad_x);
252 border_size.bottom = std::max(upper_bound_h, pool_pad_y);
253 bool window_changed = false;
254
255 TensorShape output_shape{ input->tensor_shape() };
256 output_shape.set(0, pooled_w);
257 output_shape.set(1, pooled_h);
258 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
259
260 Window win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
261 AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
262
263 if(output->total_size() != 0)
264 {
265 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
266 window_changed = update_window_and_padding(win, input_access, output_access);
267 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
268 }
269 else
270 {
271 window_changed = update_window_and_padding(win, input_access);
272 }
273
274 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
275 return std::make_pair(err, win);
276}
277} // namespace
278
279NEPoolingLayerKernel::NEPoolingLayerKernel()
280 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
281{
282}
283
284BorderSize NEPoolingLayerKernel::border_size() const
285{
286 return _border_size;
287}
288
289void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
290{
291 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
292
293 int pool_pad_x = 0;
294 int pool_pad_y = 0;
295 int pool_stride_x = 0;
296 int pool_stride_y = 0;
297 unsigned int pooled_w = 0;
298 unsigned int pooled_h = 0;
299 PoolingType pool_type = pool_info.pool_type();
300 int pool_size = pool_info.pool_size();
301 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
302 const bool exclude_padding = pool_info.exclude_padding();
303 const bool is_global_pooling = pool_info.is_global_pooling();
304 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
305 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
306
307 // Update pool size in case of global pooling
308 pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size;
309
310 // Validate pool info before calling scaled_dimensions
311 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));
312
313 // Check output dimensions
314 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
315 input->info()->dimension(1),
316 pool_size,
317 pool_size,
318 pool_info.pad_stride_info());
319
320 // Output auto initialization if not yet initialized
321 auto_init(input->info(), output->info(), pooled_w, pooled_h);
322
323 // Perform validation step
324 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100325
326 // Set instance variables
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000327 _input = input;
328 _output = output;
329 _pool_info = pool_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100330
331 // Select appropriate function
332 switch(pool_size)
333 {
334 case 2:
335 if(input->info()->data_type() == DataType::QS8)
336 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100337 switch(pool_type)
338 {
339 case PoolingType::AVG:
340 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
341 break;
342 case PoolingType::MAX:
343 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
344 break;
345 default:
346 ARM_COMPUTE_ERROR("Unsupported pooling type!");
347 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100348 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100349 else if(input->info()->data_type() == DataType::QS16)
350 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100351 switch(pool_type)
352 {
353 case PoolingType::AVG:
354 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
355 break;
356 case PoolingType::MAX:
357 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
358 break;
359 default:
360 ARM_COMPUTE_ERROR("Unsupported pooling type!");
361 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100362 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100363 else if(input->info()->data_type() == DataType::F16)
364 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100365 switch(pool_type)
366 {
367 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000368 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100369 break;
370 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000371 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100372 break;
373 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000374 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100375 break;
376 default:
377 ARM_COMPUTE_ERROR("Unsupported pooling type!");
378 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100379 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100380 else if(input->info()->data_type() == DataType::F32)
381 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100382 switch(pool_type)
383 {
384 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000385 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100386 break;
387 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000388 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100389 break;
390 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000391 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100392 break;
393 default:
394 ARM_COMPUTE_ERROR("Unsupported pooling type!");
395 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100396 }
397 break;
398 case 3:
399 if(input->info()->data_type() == DataType::QS8)
400 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100401 switch(pool_type)
402 {
403 case PoolingType::AVG:
404 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
405 break;
406 case PoolingType::MAX:
407 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
408 break;
409 default:
410 ARM_COMPUTE_ERROR("Unsupported pooling type!");
411 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100412 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100413 else if(input->info()->data_type() == DataType::QS16)
414 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100415 switch(pool_type)
416 {
417 case PoolingType::AVG:
418 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
419 break;
420 case PoolingType::MAX:
421 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
422 break;
423 default:
424 ARM_COMPUTE_ERROR("Unsupported pooling type!");
425 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100426 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100427 else if(input->info()->data_type() == DataType::F16)
428 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100429 switch(pool_type)
430 {
431 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000432 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100433 break;
434 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000435 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100436 break;
437 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000438 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100439 break;
440 default:
441 ARM_COMPUTE_ERROR("Unsupported pooling type!");
442 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100443 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100444 else if(input->info()->data_type() == DataType::F32)
445 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100446 switch(pool_type)
447 {
448 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000449 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100450 break;
451 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000452 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100453 break;
454 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000455 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100456 break;
457 default:
458 ARM_COMPUTE_ERROR("Unsupported pooling type!");
459 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100460 }
461 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100462 case 7:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100463 switch(pool_type)
464 {
465 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000466 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100467 break;
468 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000469 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100470 break;
471 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000472 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100473 break;
474 default:
475 ARM_COMPUTE_ERROR("Unsupported pooling type!");
476 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100477 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100478 default:
Gian Marco Iodice16824302017-09-28 15:41:37 +0100479 switch(pool_type)
480 {
481 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000482 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
Gian Marco Iodice16824302017-09-28 15:41:37 +0100483 break;
484 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000485 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
Gian Marco Iodice16824302017-09-28 15:41:37 +0100486 break;
487 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000488 _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
Gian Marco Iodice16824302017-09-28 15:41:37 +0100489 break;
490 default:
491 ARM_COMPUTE_ERROR("Unsupported pooling type!");
492 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100493 break;
494 }
495
496 // Configure kernel window
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000497 auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size);
498 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
499 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100500}
501
502template <PoolingType pooling_type>
503void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
504{
505 Iterator input(_input, window_input);
506 Iterator output(_output, window);
507
508 const int fixed_point_position = _input->info()->fixed_point_position();
509 constexpr int pool_size = 2;
510 int pool_pad_x = 0;
511 int pool_pad_y = 0;
512 int pool_stride_x = 0;
513 int pool_stride_y = 0;
514 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
515 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
516 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
517 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
518
519 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
520 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
521
522 execute_window_loop(window, [&](const Coordinates & id)
523 {
524 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
525 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100526 qint8x8_t lower_res = {};
527 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100528 if(pooling_type == PoolingType::AVG)
529 {
530 // Calculate scale
531 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
532 const qint8x8_t scale_vec = vdup_n_qs8(scale);
533
534 // Perform pooling
535 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100536 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
537 if(pool_stride_x == 1)
538 {
539 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
540 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
541 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100542 }
543 else
544 {
545 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100546 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
547 if(pool_stride_x == 1)
548 {
549 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
550 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
551 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100552 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100553 if(pool_stride_x == 1)
554 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100555 const qint8x8x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100556 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
557 }
558 else
559 {
560 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
561 }
562 },
563 input, output);
564}
565
566template <PoolingType pooling_type>
567void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
568{
569 Iterator input(_input, window_input);
570 Iterator output(_output, window);
571
572 const int fixed_point_position = _input->info()->fixed_point_position();
573 constexpr int pool_size = 2;
574 int pool_pad_x = 0;
575 int pool_pad_y = 0;
576 int pool_stride_x = 0;
577 int pool_stride_y = 0;
578 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
579 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
580 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
581 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
582
583 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
584 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
585
586 execute_window_loop(window, [&](const Coordinates & id)
587 {
588 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
589 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
590 qint16x4_t lower_res = {};
591 qint16x4_t upper_res = {};
592 if(pooling_type == PoolingType::AVG)
593 {
594 // Calculate scale
595 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
596 const qint16x4_t scale_vec = vdup_n_qs16(scale);
597
598 // Perform pooling
599 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
600 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
601 if(pool_stride_x == 1)
602 {
603 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
604 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
605 }
606 }
607 else
608 {
609 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
610 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
611 if(pool_stride_x == 1)
612 {
613 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
614 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
615 }
616 }
617 if(pool_stride_x == 1)
618 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100619 const qint16x4x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100620 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
621 }
622 else
623 {
624 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
625 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100626 },
627 input, output);
628}
629
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000630template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100631void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
632{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000633#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100634 Iterator input(_input, window_input);
635 Iterator output(_output, window);
636
637 constexpr const int pool_size = 3;
638 int pool_pad_x = 0;
639 int pool_pad_y = 0;
640 int pool_stride_x = 0;
641 int pool_stride_y = 0;
642 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
643 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000644 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
645 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100646
647 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
648 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
649 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
650
651 execute_window_loop(window, [&](const Coordinates & id)
652 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100653 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
654 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
655 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
656 float16x4_t res = {};
657
658 // Get power of 2 in case of l2 pooling
659 if(pooling_type == PoolingType::L2)
660 {
661 top_data = vmul_f16(top_data, top_data);
662 middle_data = vmul_f16(middle_data, middle_data);
663 bottom_data = vmul_f16(bottom_data, bottom_data);
664 }
665
666 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100667 {
668 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000669 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100670 const float16x4_t scale_v = vdup_n_f16(scale);
671 // Perform pooling
672 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
673 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
674 res = vmul_f16(vpadd_f16(res, res), scale_v);
675 }
676 else
677 {
678 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
679 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
680 res = vpmax_f16(res, res);
681 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100682
683 // Calculate square-root in case of l2 pooling
684 if(pooling_type == PoolingType::L2)
685 {
686 res = vinv_f16(vinvsqrt_f16(res));
687 }
688
Pablo Tello0c34fe22017-06-26 17:17:42 +0100689 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
690 },
691 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000692#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100693 ARM_COMPUTE_UNUSED(window_input);
694 ARM_COMPUTE_UNUSED(window);
695 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000696#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100697}
698
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000699template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100700void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
701{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000702#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100703 Iterator input(_input, window_input);
704 Iterator output(_output, window);
705 constexpr int pool_size = 2;
706 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
707 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
708 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000709 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
710 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100711
712 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
713 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
714
715 execute_window_loop(window, [&](const Coordinates & id)
716 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100717 auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
718 auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100719 float16x8_t res = {};
720
Georgios Pinitascdf51452017-08-31 14:21:36 +0100721 // Get power of 2 in case of l2 pooling
722 if(pooling_type == PoolingType::L2)
723 {
724 top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
725 top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
726 bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
727 bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
728 }
729
730 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100731 {
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000732 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100733 const float16x8_t scale_v = vdupq_n_f16(scale);
734 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
735 }
736 else
737 {
738 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
739 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100740
741 // Calculate square-root in case of l2 pooling
742 if(pooling_type == PoolingType::L2)
743 {
744 res = vinvq_f16(vinvsqrtq_f16(res));
745 }
746
747 // Store result
Pablo Tello0c34fe22017-06-26 17:17:42 +0100748 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
749 },
750 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000751#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100752 ARM_COMPUTE_UNUSED(window_input);
753 ARM_COMPUTE_UNUSED(window);
754 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000755#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100756}
757
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000758template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100759void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
760{
761 Iterator input(_input, window_input);
762 Iterator output(_output, window);
763
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100764 constexpr int pool_size = 2;
765 int pool_pad_x = 0;
766 int pool_pad_y = 0;
767 int pool_stride_x = 0;
768 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100769 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
770 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000771 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
772 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100773
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100774 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
775 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100776
777 execute_window_loop(window, [&](const Coordinates & id)
778 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100779 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
780 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
781 float32x2_t res = {};
782 float final_res = 0;
783
784 // Get power of 2 in case of l2 pooling
785 if(pooling_type == PoolingType::L2)
786 {
787 top_data = vmul_f32(top_data, top_data);
788 bottom_data = vmul_f32(bottom_data, bottom_data);
789 }
790
791 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100792 {
793 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000794 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100795 const float32x2_t scale_v = vdup_n_f32(scale);
796
797 // Perform pooling
798 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
799 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
800 }
801 else
802 {
803 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
804 res = vpmax_f32(max_data, max_data);
805 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100806 final_res = vget_lane_f32(res, 0);
807
808 // Calculate square-root in case of l2 pooling
809 if(pooling_type == PoolingType::L2)
810 {
811 final_res = sqrt(final_res);
812 }
813
814 // Store result
815 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100816 },
817 input, output);
818}
819
820template <PoolingType pooling_type>
821void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
822{
823 Iterator input(_input, window_input);
824 Iterator output(_output, window);
825
826 const int fixed_point_position = _input->info()->fixed_point_position();
827 constexpr int pool_size = 3;
828 int pool_pad_x = 0;
829 int pool_pad_y = 0;
830 int pool_stride_x = 0;
831 int pool_stride_y = 0;
832 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
833 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
834 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
835 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
836
837 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
838 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
839 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
840
841 execute_window_loop(window, [&](const Coordinates & id)
842 {
843 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
844 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
845 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
846 qint8x8_t res = {};
847 if(pooling_type == PoolingType::AVG)
848 {
849 // Calculate scale
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100850 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100851
852 // Perform pooling for stride 2
853 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
854 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
855 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
856 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
857 if(pool_stride_x == 2)
858 {
859 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
860 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100861 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100862 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100863 res = vqmul_qs8(res, scale_vec, fixed_point_position);
864 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100865 }
866 else
867 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100868 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
869 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100870 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100871 }
872 else
873 {
874 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
875 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
876 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
877 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
878
879 if(pool_stride_x == 2)
880 {
881 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
882 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
883 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100884 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100885 }
886 else
887 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100888 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100889 }
890 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100891 },
892 input, output);
893}
894
895template <PoolingType pooling_type>
896void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
897{
898 Iterator input(_input, window_input);
899 Iterator output(_output, window);
900
901 const int fixed_point_position = _input->info()->fixed_point_position();
902 constexpr int pool_size = 3;
903 int pool_pad_x = 0;
904 int pool_pad_y = 0;
905 int pool_stride_x = 0;
906 int pool_stride_y = 0;
907 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
908 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
909 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
910 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
911
912 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
913 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
914 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
915
916 execute_window_loop(window, [&](const Coordinates & id)
917 {
918 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
919 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
920 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
921
922 if(pooling_type == PoolingType::AVG)
923 {
924 // Calculate scale
925 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
926
927 // Perform pooling for stride 2
928 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
929 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
930 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
931 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
932 if(pool_stride_x == 2)
933 {
934 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
935 const qint16x4_t scale_vec = vdup_n_qs16(scale);
936 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
937 }
938 else
939 {
940 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
941 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
942 }
943 }
944 else
945 {
946 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
947 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
948 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
949 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
950
951 if(pool_stride_x == 2)
952 {
953 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
954 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
955 }
956 else
957 {
958 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
959 }
960 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100961 },
962 input, output);
963}
964
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000965template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100966void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
967{
968 Iterator input(_input, window_input);
969 Iterator output(_output, window);
970
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100971 constexpr const int pool_size = 3;
972 int pool_pad_x = 0;
973 int pool_pad_y = 0;
974 int pool_stride_x = 0;
975 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100976 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
977 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000978 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
979 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100980
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100981 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
982 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
983 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100984
985 execute_window_loop(window, [&](const Coordinates & id)
986 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100987 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
988 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
989 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
990 float32x2_t res = {};
991 float final_res = 0;
992
993 // Get power of 2 in case of l2 pooling
994 if(pooling_type == PoolingType::L2)
995 {
996 top_data = vmulq_f32(top_data, top_data);
997 middle_data = vmulq_f32(middle_data, middle_data);
998 bottom_data = vmulq_f32(bottom_data, bottom_data);
999 }
1000
1001 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001002 {
1003 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001004 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001005 const float32x2_t scale_v = vdup_n_f32(scale);
1006
1007 // Perform pooling
1008 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1009 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1010 res = vmul_f32(vpadd_f32(res, res), scale_v);
1011 }
1012 else
1013 {
1014 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1015 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1016 res = vpmax_f32(res, res);
1017 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001018 final_res = vget_lane_f32(res, 0);
1019
1020 // Calculate square-root in case of l2 pooling
1021 if(pooling_type == PoolingType::L2)
1022 {
1023 final_res = sqrt(final_res);
1024 }
1025
1026 // Store result
1027 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001028 },
1029 input, output);
1030}
1031
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001032template <PoolingType pooling_type, bool exclude_padding>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001033void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
1034{
1035 Iterator input(_input, window_input);
1036 Iterator output(_output, window);
1037
1038 constexpr const int pool_size = 7;
1039 int pool_pad_x = 0;
1040 int pool_pad_y = 0;
1041 int pool_stride_x = 0;
1042 int pool_stride_y = 0;
1043 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1044 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001045 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1046 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001047
1048 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1049 for(int i = 0; i < pool_size; ++i)
1050 {
1051 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
1052 }
1053
1054 execute_window_loop(window, [&](const Coordinates & id)
1055 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001056 float32x2_t res = {};
1057 float final_res = 0.f;
1058 if(pooling_type != PoolingType::MAX)
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001059 {
1060 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001061 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001062 const float32x2_t scale_v = vdup_n_f32(scale);
1063
1064 // Perform pooling
Georgios Pinitascdf51452017-08-31 14:21:36 +01001065 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1066 // Get power of 2 in case of l2 pooling
1067 if(pooling_type == PoolingType::L2)
1068 {
1069 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1070 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1071 }
1072 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001073 for(int i = 1; i < pool_size; ++i)
1074 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001075 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1076 // Get power of 2 in case of l2 pooling
1077 if(pooling_type == PoolingType::L2)
1078 {
1079 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1080 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1081 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001082 sum_data = vaddq_f32(sum_data, data.val[0]);
1083 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1084 }
1085 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1086 res = vmul_f32(vpadd_f32(res, res), scale_v);
1087 }
1088 else
1089 {
1090 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1091 for(int i = 1; i < pool_size; ++i)
1092 {
1093 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1094 max_data = vmax2q_f32(max_data, data);
1095 }
1096 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1097 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1098 res = vpmax_f32(res, res);
1099 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001100 final_res = vget_lane_f32(res, 0);
1101
1102 // Calculate square-root in case of l2 pooling
1103 if(pooling_type == PoolingType::L2)
1104 {
1105 final_res = sqrt(final_res);
1106 }
1107
1108 // Store result
1109 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001110 },
1111 input, output);
1112}
1113
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001114template <PoolingType pooling_type, bool exclude_padding>
Gian Marco Iodice16824302017-09-28 15:41:37 +01001115void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
1116{
1117 Iterator input(_input, window_input);
1118 Iterator output(_output, window);
1119
Georgios Pinitas4c2dd542017-11-13 12:58:41 +00001120 const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001121 int pool_pad_x = 0;
1122 int pool_pad_y = 0;
1123 int pool_stride_x = 0;
1124 int pool_stride_y = 0;
1125 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1126 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001127 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1128 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001129
1130 execute_window_loop(window, [&](const Coordinates & id)
1131 {
1132 float res = 0.0f;
1133
1134 if(pooling_type != PoolingType::MAX)
1135 {
1136 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001137 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001138
1139 // Perform pooling
1140 float32x4_t vres = vdupq_n_f32(0.0f);
1141
1142 for(int y = 0; y < pool_size; ++y)
1143 {
1144 int x = 0;
1145 for(; x <= (pool_size - 4); x += 4)
1146 {
1147 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
1148 (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1149
1150 // Get power of 2 in case of l2 pooling and accumulate
1151 if(pooling_type == PoolingType::L2)
1152 {
1153 vres = vmlaq_f32(vres, data, data);
1154 }
1155 else
1156 {
1157 vres = vaddq_f32(vres, data);
1158 }
1159 }
1160
1161 // Leftover for loop
1162 for(; x < pool_size; ++x)
1163 {
1164 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1165
1166 // Get power of 2 in case of l2 pooling
1167 if(pooling_type == PoolingType::L2)
1168 {
1169 data *= data;
1170 }
1171
1172 res += data;
1173 }
1174 }
1175
1176#if defined(__aarch64__)
1177 // Reduction operation available on 64 bit architectures only
1178 res += vaddvq_f32(vres);
1179#else // __aarch64__
1180 // Reduction
1181 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1182 tmp = vpadd_f32(tmp, tmp);
1183
1184 res += vget_lane_f32(tmp, 0);
1185#endif // __aarch64__
1186 // Divide by scale
1187 res *= scale;
1188 }
1189 else
1190 {
1191 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::min());
1192 res = std::numeric_limits<float>::min();
1193
1194 for(int y = 0; y < pool_size; ++y)
1195 {
1196 int x = 0;
1197 for(; x <= (pool_size - 4); x += 4)
1198 {
1199 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
1200 (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1201 vres = vmaxq_f32(vres, data);
1202 }
1203
1204 // Leftover for loop
1205 for(; x < pool_size; ++x)
1206 {
1207 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1208 res = std::max(res, data);
1209 }
1210 }
1211
1212#if defined(__aarch64__)
1213 // Reduction operation available on 64 bit architectures only
1214 res = std::max(vmaxvq_f32(vres), res);
1215#else // __aarch64__
1216 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1217 tmp = vpmax_f32(tmp, tmp);
1218
1219 res = std::max(res, vget_lane_f32(tmp, 0));
1220#endif // __aarch64__
1221 }
1222
1223 // Calculate square-root in case of l2 pooling
1224 if(pooling_type == PoolingType::L2)
1225 {
1226 res = std::sqrt(res);
1227 }
1228
1229 // Store result
1230 *(reinterpret_cast<float *>(output.ptr())) = res;
1231 },
1232 input, output);
1233}
1234
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001235Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
1236{
1237 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
1238
1239 unsigned int pooled_w = 0;
1240 unsigned int pooled_h = 0;
1241 unsigned int num_elems_processed_per_iteration = 0;
1242 BorderSize border_size(0);
1243
1244 const bool is_global_pooling = pool_info.is_global_pooling();
1245 const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
1246
1247 // Validate pool info befor calling scaled_dimensions
1248 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size));
1249
1250 // Check output dimensions
1251 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
1252 input->dimension(1),
1253 pool_size,
1254 pool_size,
1255 pool_info.pad_stride_info());
1256
1257 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size));
1258 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first);
1259
1260 return Status{};
1261}
1262
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001263void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001264{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001265 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001266 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1267 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1268 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1269
Pablo Tello0c34fe22017-06-26 17:17:42 +01001270 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
1271 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001272
1273 // Set step for input in x and y direction for the input
1274 Window window_input(window);
1275 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +01001276 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001277 {
Pablo Tello0c34fe22017-06-26 17:17:42 +01001278 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001279 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +01001280 case DataType::F16:
1281 {
1282 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1283 break;
1284 }
1285 case DataType::F32:
1286 {
1287 window_x_inc = pool_stride_x;
1288 break;
1289 }
1290 default:
1291 {
1292 ARM_COMPUTE_ERROR("Not supported");
1293 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001294 }
1295 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
1296 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
1297
1298 // Run function
1299 (this->*_func)(window_input, window);
1300}