blob: df56c23800585483650e1d2450c6854338da5850 [file] [log] [blame]
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
31#include "arm_compute/core/NEON/NEFixedPoint.h"
32#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Utils.h"
34#include "arm_compute/core/Validate.h"
35#include "arm_compute/core/Window.h"
36
37#include <algorithm>
38#include <arm_neon.h>
39#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010040#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041#include <string>
42#include <tuple>
43
44using namespace arm_compute;
45
46namespace
47{
48inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
49 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
50{
Pablo Tello0c34fe22017-06-26 17:17:42 +010051 const int start_x = id.x() * stride_x - pad_x;
52 const int start_y = id.y() * stride_y - pad_y;
53 const int end_x = std::min(start_x + pool_size, upper_bound_w);
54 const int end_y = std::min(start_y + pool_size, upper_bound_h);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055 return 1.f / ((end_y - start_y) * (end_x - start_x));
56}
57
58inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
59 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
60{
Pablo Tello0c34fe22017-06-26 17:17:42 +010061 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010062 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
63 const int start_x = id.x() * stride_x - pad_x;
64 const int start_y = id.y() * stride_y - pad_y;
65 const int end_x = std::min(start_x + pool_size, upper_bound_w);
66 const int end_y = std::min(start_y + pool_size, upper_bound_h);
67 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010068 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
69}
70
71inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
72 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
73{
74 static std::array<qint16_t, 10> scale_values_q16 =
75 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
76 const int start_x = id.x() * stride_x - pad_x;
77 const int start_y = id.y() * stride_y - pad_y;
78 const int end_x = std::min(start_x + pool_size, upper_bound_w);
79 const int end_y = std::min(start_y + pool_size, upper_bound_h);
80 const int val = ((end_y - start_y) * (end_x - start_x));
81 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010082}
83} // namespace
84
85NEPoolingLayerKernel::NEPoolingLayerKernel()
86 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
87{
88}
89
90BorderSize NEPoolingLayerKernel::border_size() const
91{
92 return _border_size;
93}
94
95void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
96{
Gian Marco Iodice4e288692017-06-27 11:41:59 +010097 int pool_pad_x = 0;
98 int pool_pad_y = 0;
99 int pool_stride_x = 0;
100 int pool_stride_y = 0;
101 unsigned int pooled_w = 0;
102 unsigned int pooled_h = 0;
103 PoolingType pool_type = pool_info.pool_type();
104 int pool_size = pool_info.pool_size();
105 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100106 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
107 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
108
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100109 static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
110 ARM_COMPUTE_UNUSED(supported_pool_sizes);
111
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100112 ARM_COMPUTE_ERROR_ON_NULLPTR(output);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100113 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
114 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
115 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
116 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100117 ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
118 ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100119 ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100120 ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100121
122 // Check output dimensions
123 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
Gian Marco Iodice4e288692017-06-27 11:41:59 +0100124 pool_size, pool_size, pool_info.pad_stride_info());
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100125
126 // Output auto initialization if not yet initialized
127 {
128 TensorShape output_shape{ input->info()->tensor_shape() };
129 output_shape.set(0, pooled_w);
130 output_shape.set(1, pooled_h);
131
132 auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
133 }
134
135 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
136 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100137 ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
138
139 unsigned int num_elems_read_per_iteration = 0;
140 unsigned int num_elems_processed_per_iteration = 0;
141 unsigned int num_elems_horizontal_window = 0;
142
143 // Select element size
144 switch(input->info()->data_type())
145 {
146 case DataType::QS8:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100147 num_elems_read_per_iteration = 16;
148 switch(pool_size)
149 {
150 case 2:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100151 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100152 break;
153 case 3:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100154 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100155 break;
156 default:
157 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100158 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100159 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100160 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
161 break;
162 case DataType::QS16:
163 num_elems_read_per_iteration = 8;
164 switch(pool_size)
165 {
166 case 2:
167 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
168 break;
169 case 3:
170 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
171 break;
172 default:
173 ARM_COMPUTE_ERROR("Pooling size not supported");
174 }
175 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100176 break;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100177#ifdef ARM_COMPUTE_ENABLE_FP16
178 case DataType::F16:
179 switch(pool_size)
180 {
181 case 2:
182 num_elems_read_per_iteration = 16;
183 num_elems_processed_per_iteration = 8;
184 num_elems_horizontal_window = 8;
185 break;
186 case 3:
187 num_elems_read_per_iteration = 4;
188 num_elems_processed_per_iteration = 1;
189 num_elems_horizontal_window = 1;
190 break;
191 default:
192 ARM_COMPUTE_ERROR("Pooling size not supported");
193 break;
194 }
195 break;
196#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100197 case DataType::F32:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100198 switch(pool_size)
199 {
200 case 2:
201 num_elems_read_per_iteration = 2;
202 break;
203 case 3:
204 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
205 break;
206 case 7:
207 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
208 break;
209 default:
210 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100211 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100212 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100213 num_elems_processed_per_iteration = 1;
214 num_elems_horizontal_window = 1;
215 break;
216 default:
217 ARM_COMPUTE_ERROR("Element size not supported");
218 break;
219 }
220
221 _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
222 const int input_width = input->info()->dimension(0);
223 const int input_height = input->info()->dimension(1);
224 const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
225 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
226
227 // Set instance variables
228 _input = input;
229 _output = output;
230 _pool_info = pool_info;
231 _border_size = BorderSize(pool_pad_y, pool_pad_x);
232 _border_size.right = std::max(upper_bound_w, pool_pad_x);
233 _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
234
235 // Select appropriate function
236 switch(pool_size)
237 {
238 case 2:
239 if(input->info()->data_type() == DataType::QS8)
240 {
241 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
242 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100243 else if(input->info()->data_type() == DataType::QS16)
244 {
245 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
246 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100247 else if(input->info()->data_type() == DataType::F16)
248 {
249 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
250 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100251 else if(input->info()->data_type() == DataType::F32)
252 {
253 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
254 }
255 break;
256 case 3:
257 if(input->info()->data_type() == DataType::QS8)
258 {
259 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
260 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100261 else if(input->info()->data_type() == DataType::QS16)
262 {
263 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
264 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100265 else if(input->info()->data_type() == DataType::F16)
266 {
267 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
268 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100269 else if(input->info()->data_type() == DataType::F32)
270 {
271 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
272 }
273 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100274 case 7:
275 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
276 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100277 default:
278 ARM_COMPUTE_ERROR("Unsupported pooling size");
279 break;
280 }
281
282 // Configure kernel window
283 Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
284 AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
285 AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
286 update_window_and_padding(win, input_access, output_access);
287 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
288 INEKernel::configure(win);
289}
290
291template <PoolingType pooling_type>
292void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
293{
294 Iterator input(_input, window_input);
295 Iterator output(_output, window);
296
297 const int fixed_point_position = _input->info()->fixed_point_position();
298 constexpr int pool_size = 2;
299 int pool_pad_x = 0;
300 int pool_pad_y = 0;
301 int pool_stride_x = 0;
302 int pool_stride_y = 0;
303 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
304 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
305 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
306 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
307
308 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
309 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
310
311 execute_window_loop(window, [&](const Coordinates & id)
312 {
313 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
314 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100315 qint8x8_t lower_res = {};
316 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100317 if(pooling_type == PoolingType::AVG)
318 {
319 // Calculate scale
320 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
321 const qint8x8_t scale_vec = vdup_n_qs8(scale);
322
323 // Perform pooling
324 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100325 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
326 if(pool_stride_x == 1)
327 {
328 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
329 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
330 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100331 }
332 else
333 {
334 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100335 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
336 if(pool_stride_x == 1)
337 {
338 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
339 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
340 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100341 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100342 if(pool_stride_x == 1)
343 {
344 const qint8x8x2_t res = vzip_s8(lower_res, upper_res);
345 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
346 }
347 else
348 {
349 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
350 }
351 },
352 input, output);
353}
354
355template <PoolingType pooling_type>
356void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
357{
358 Iterator input(_input, window_input);
359 Iterator output(_output, window);
360
361 const int fixed_point_position = _input->info()->fixed_point_position();
362 constexpr int pool_size = 2;
363 int pool_pad_x = 0;
364 int pool_pad_y = 0;
365 int pool_stride_x = 0;
366 int pool_stride_y = 0;
367 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
368 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
369 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
370 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
371
372 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
373 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
374
375 execute_window_loop(window, [&](const Coordinates & id)
376 {
377 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
378 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
379 qint16x4_t lower_res = {};
380 qint16x4_t upper_res = {};
381 if(pooling_type == PoolingType::AVG)
382 {
383 // Calculate scale
384 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
385 const qint16x4_t scale_vec = vdup_n_qs16(scale);
386
387 // Perform pooling
388 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
389 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
390 if(pool_stride_x == 1)
391 {
392 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
393 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
394 }
395 }
396 else
397 {
398 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
399 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
400 if(pool_stride_x == 1)
401 {
402 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
403 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
404 }
405 }
406 if(pool_stride_x == 1)
407 {
408 const qint16x4x2_t res = vzip_s16(lower_res, upper_res);
409 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
410 }
411 else
412 {
413 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
414 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100415 },
416 input, output);
417}
418
419template <PoolingType pooling_type>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100420void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
421{
422#ifdef ARM_COMPUTE_ENABLE_FP16
423 Iterator input(_input, window_input);
424 Iterator output(_output, window);
425
426 constexpr const int pool_size = 3;
427 int pool_pad_x = 0;
428 int pool_pad_y = 0;
429 int pool_stride_x = 0;
430 int pool_stride_y = 0;
431 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
432 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
433 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
434 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
435
436 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
437 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
438 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
439
440 execute_window_loop(window, [&](const Coordinates & id)
441 {
442 const float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
443 const float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
444 const float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
445 float16x4_t res = {};
446 if(pooling_type == PoolingType::AVG)
447 {
448 // Calculate scale
449 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
450 const float16x4_t scale_v = vdup_n_f16(scale);
451 // Perform pooling
452 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
453 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
454 res = vmul_f16(vpadd_f16(res, res), scale_v);
455 }
456 else
457 {
458 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
459 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
460 res = vpmax_f16(res, res);
461 }
462 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
463 },
464 input, output);
465#else /* ARM_COMPUTE_ENABLE_FP16 */
466 ARM_COMPUTE_UNUSED(window_input);
467 ARM_COMPUTE_UNUSED(window);
468 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
469#endif /* ARM_COMPUTE_ENABLE_FP16 */
470}
471
472template <PoolingType pooling_type>
473void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
474{
475#ifdef ARM_COMPUTE_ENABLE_FP16
476 Iterator input(_input, window_input);
477 Iterator output(_output, window);
478 constexpr int pool_size = 2;
479 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
480 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
481 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
482 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
483 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
484
485 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
486 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
487
488 execute_window_loop(window, [&](const Coordinates & id)
489 {
490 const auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
491 const auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
492 float16x8_t res = {};
493
494 if(pooling_type == PoolingType::AVG)
495 {
496 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
497 const float16x8_t scale_v = vdupq_n_f16(scale);
498 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
499 }
500 else
501 {
502 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
503 }
504 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
505 },
506 input, output);
507#else /* ARM_COMPUTE_ENABLE_FP16 */
508 ARM_COMPUTE_UNUSED(window_input);
509 ARM_COMPUTE_UNUSED(window);
510 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
511#endif /* ARM_COMPUTE_ENABLE_FP16 */
512}
513
514template <PoolingType pooling_type>
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100515void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
516{
517 Iterator input(_input, window_input);
518 Iterator output(_output, window);
519
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100520 constexpr int pool_size = 2;
521 int pool_pad_x = 0;
522 int pool_pad_y = 0;
523 int pool_stride_x = 0;
524 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100525 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
526 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
527 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
528 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
529
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100530 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
531 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100532
533 execute_window_loop(window, [&](const Coordinates & id)
534 {
535 const float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
536 const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
537 float32x2_t res = {};
538 if(pooling_type == PoolingType::AVG)
539 {
540 // Calculate scale
541 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
542 const float32x2_t scale_v = vdup_n_f32(scale);
543
544 // Perform pooling
545 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
546 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
547 }
548 else
549 {
550 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
551 res = vpmax_f32(max_data, max_data);
552 }
553 *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
554 },
555 input, output);
556}
557
558template <PoolingType pooling_type>
559void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
560{
561 Iterator input(_input, window_input);
562 Iterator output(_output, window);
563
564 const int fixed_point_position = _input->info()->fixed_point_position();
565 constexpr int pool_size = 3;
566 int pool_pad_x = 0;
567 int pool_pad_y = 0;
568 int pool_stride_x = 0;
569 int pool_stride_y = 0;
570 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
571 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
572 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
573 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
574
575 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
576 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
577 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
578
579 execute_window_loop(window, [&](const Coordinates & id)
580 {
581 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
582 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
583 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
584 qint8x8_t res = {};
585 if(pooling_type == PoolingType::AVG)
586 {
587 // Calculate scale
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100588 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100589
590 // Perform pooling for stride 2
591 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
592 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
593 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
594 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
595 if(pool_stride_x == 2)
596 {
597 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
598 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100599 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100600 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100601 res = vqmul_qs8(res, scale_vec, fixed_point_position);
602 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100603 }
604 else
605 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100606 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
607 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100608 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100609 }
610 else
611 {
612 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
613 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
614 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
615 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
616
617 if(pool_stride_x == 2)
618 {
619 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
620 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
621 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100622 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100623 }
624 else
625 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100626 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100627 }
628 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100629 },
630 input, output);
631}
632
633template <PoolingType pooling_type>
634void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
635{
636 Iterator input(_input, window_input);
637 Iterator output(_output, window);
638
639 const int fixed_point_position = _input->info()->fixed_point_position();
640 constexpr int pool_size = 3;
641 int pool_pad_x = 0;
642 int pool_pad_y = 0;
643 int pool_stride_x = 0;
644 int pool_stride_y = 0;
645 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
646 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
647 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
648 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
649
650 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
651 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
652 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
653
654 execute_window_loop(window, [&](const Coordinates & id)
655 {
656 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
657 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
658 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
659
660 if(pooling_type == PoolingType::AVG)
661 {
662 // Calculate scale
663 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
664
665 // Perform pooling for stride 2
666 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
667 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
668 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
669 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
670 if(pool_stride_x == 2)
671 {
672 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
673 const qint16x4_t scale_vec = vdup_n_qs16(scale);
674 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
675 }
676 else
677 {
678 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
679 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
680 }
681 }
682 else
683 {
684 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
685 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
686 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
687 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
688
689 if(pool_stride_x == 2)
690 {
691 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
692 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
693 }
694 else
695 {
696 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
697 }
698 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100699 },
700 input, output);
701}
702
703template <PoolingType pooling_type>
704void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
705{
706 Iterator input(_input, window_input);
707 Iterator output(_output, window);
708
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100709 constexpr const int pool_size = 3;
710 int pool_pad_x = 0;
711 int pool_pad_y = 0;
712 int pool_stride_x = 0;
713 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100714 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
715 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
716 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
717 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
718
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100719 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
720 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
721 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100722
723 execute_window_loop(window, [&](const Coordinates & id)
724 {
725 const float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
726 const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
727 const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
728 float32x2_t res = {};
729 if(pooling_type == PoolingType::AVG)
730 {
731 // Calculate scale
732 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
733 const float32x2_t scale_v = vdup_n_f32(scale);
734
735 // Perform pooling
736 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
737 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
738 res = vmul_f32(vpadd_f32(res, res), scale_v);
739 }
740 else
741 {
742 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
743 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
744 res = vpmax_f32(res, res);
745 }
746 *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
747 },
748 input, output);
749}
750
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100751template <PoolingType pooling_type>
752void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
753{
754 Iterator input(_input, window_input);
755 Iterator output(_output, window);
756
757 constexpr const int pool_size = 7;
758 int pool_pad_x = 0;
759 int pool_pad_y = 0;
760 int pool_stride_x = 0;
761 int pool_stride_y = 0;
762 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
763 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
764 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
765 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
766
767 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
768 for(int i = 0; i < pool_size; ++i)
769 {
770 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
771 }
772
773 execute_window_loop(window, [&](const Coordinates & id)
774 {
775 float32x2_t res = {};
776 if(pooling_type == PoolingType::AVG)
777 {
778 // Calculate scale
779 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
780 const float32x2_t scale_v = vdup_n_f32(scale);
781
782 // Perform pooling
783 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
784 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
785 for(int i = 1; i < pool_size; ++i)
786 {
787 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
788 sum_data = vaddq_f32(sum_data, data.val[0]);
789 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
790 }
791 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
792 res = vmul_f32(vpadd_f32(res, res), scale_v);
793 }
794 else
795 {
796 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
797 for(int i = 1; i < pool_size; ++i)
798 {
799 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
800 max_data = vmax2q_f32(max_data, data);
801 }
802 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
803 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
804 res = vpmax_f32(res, res);
805 }
806 *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
807 },
808 input, output);
809}
810
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100811void NEPoolingLayerKernel::run(const Window &window)
812{
813 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
814 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
815 ARM_COMPUTE_ERROR_ON(_func == nullptr);
816
Pablo Tello0c34fe22017-06-26 17:17:42 +0100817 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
818 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100819
820 // Set step for input in x and y direction for the input
821 Window window_input(window);
822 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100823 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100824 {
Pablo Tello0c34fe22017-06-26 17:17:42 +0100825 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100826 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +0100827 case DataType::F16:
828 {
829 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
830 break;
831 }
832 case DataType::F32:
833 {
834 window_x_inc = pool_stride_x;
835 break;
836 }
837 default:
838 {
839 ARM_COMPUTE_ERROR("Not supported");
840 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100841 }
842 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
843 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
844
845 // Run function
846 (this->*_func)(window_input, window);
847}