blob: fdcbd5a898f72a09f11748a7b29465e9acc319d2 [file] [log] [blame]
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
31#include "arm_compute/core/NEON/NEFixedPoint.h"
32#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Utils.h"
34#include "arm_compute/core/Validate.h"
35#include "arm_compute/core/Window.h"
36
37#include <algorithm>
38#include <arm_neon.h>
39#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010040#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041#include <string>
42#include <tuple>
43
44using namespace arm_compute;
45
46namespace
47{
48inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
49 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
50{
Pablo Tello0c34fe22017-06-26 17:17:42 +010051 const int start_x = id.x() * stride_x - pad_x;
52 const int start_y = id.y() * stride_y - pad_y;
53 const int end_x = std::min(start_x + pool_size, upper_bound_w);
54 const int end_y = std::min(start_y + pool_size, upper_bound_h);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055 return 1.f / ((end_y - start_y) * (end_x - start_x));
56}
57
58inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
59 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
60{
Pablo Tello0c34fe22017-06-26 17:17:42 +010061 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010062 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
63 const int start_x = id.x() * stride_x - pad_x;
64 const int start_y = id.y() * stride_y - pad_y;
65 const int end_x = std::min(start_x + pool_size, upper_bound_w);
66 const int end_y = std::min(start_y + pool_size, upper_bound_h);
67 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010068 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
69}
70
71inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
72 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
73{
74 static std::array<qint16_t, 10> scale_values_q16 =
75 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
76 const int start_x = id.x() * stride_x - pad_x;
77 const int start_y = id.y() * stride_y - pad_y;
78 const int end_x = std::min(start_x + pool_size, upper_bound_w);
79 const int end_y = std::min(start_y + pool_size, upper_bound_h);
80 const int val = ((end_y - start_y) * (end_x - start_x));
81 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010082}
83} // namespace
84
85NEPoolingLayerKernel::NEPoolingLayerKernel()
86 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
87{
88}
89
90BorderSize NEPoolingLayerKernel::border_size() const
91{
92 return _border_size;
93}
94
95void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
96{
Gian Marco Iodice4e288692017-06-27 11:41:59 +010097 int pool_pad_x = 0;
98 int pool_pad_y = 0;
99 int pool_stride_x = 0;
100 int pool_stride_y = 0;
101 unsigned int pooled_w = 0;
102 unsigned int pooled_h = 0;
103 PoolingType pool_type = pool_info.pool_type();
104 int pool_size = pool_info.pool_size();
105 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100106 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
107 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
108
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100109 static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
110 ARM_COMPUTE_UNUSED(supported_pool_sizes);
111
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100112 ARM_COMPUTE_ERROR_ON_NULLPTR(output);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100113 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100114 ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
115 ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100116 ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100117 ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100118
119 // Check output dimensions
120 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
Gian Marco Iodice4e288692017-06-27 11:41:59 +0100121 pool_size, pool_size, pool_info.pad_stride_info());
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100122
123 // Output auto initialization if not yet initialized
124 {
125 TensorShape output_shape{ input->info()->tensor_shape() };
126 output_shape.set(0, pooled_w);
127 output_shape.set(1, pooled_h);
128
129 auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
130 }
131
132 ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
133 ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134 ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
135
136 unsigned int num_elems_read_per_iteration = 0;
137 unsigned int num_elems_processed_per_iteration = 0;
138 unsigned int num_elems_horizontal_window = 0;
139
140 // Select element size
141 switch(input->info()->data_type())
142 {
143 case DataType::QS8:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100144 num_elems_read_per_iteration = 16;
145 switch(pool_size)
146 {
147 case 2:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100148 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100149 break;
150 case 3:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100151 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100152 break;
153 default:
154 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100155 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100156 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100157 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
158 break;
159 case DataType::QS16:
160 num_elems_read_per_iteration = 8;
161 switch(pool_size)
162 {
163 case 2:
164 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
165 break;
166 case 3:
167 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
168 break;
169 default:
170 ARM_COMPUTE_ERROR("Pooling size not supported");
171 }
172 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100173 break;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100174#ifdef ARM_COMPUTE_ENABLE_FP16
175 case DataType::F16:
176 switch(pool_size)
177 {
178 case 2:
179 num_elems_read_per_iteration = 16;
180 num_elems_processed_per_iteration = 8;
181 num_elems_horizontal_window = 8;
182 break;
183 case 3:
184 num_elems_read_per_iteration = 4;
185 num_elems_processed_per_iteration = 1;
186 num_elems_horizontal_window = 1;
187 break;
188 default:
189 ARM_COMPUTE_ERROR("Pooling size not supported");
190 break;
191 }
192 break;
193#endif /* ARM_COMPUTE_ENABLE_FP16 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100194 case DataType::F32:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100195 switch(pool_size)
196 {
197 case 2:
198 num_elems_read_per_iteration = 2;
199 break;
200 case 3:
201 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
202 break;
203 case 7:
204 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
205 break;
206 default:
207 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100208 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100209 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100210 num_elems_processed_per_iteration = 1;
211 num_elems_horizontal_window = 1;
212 break;
213 default:
214 ARM_COMPUTE_ERROR("Element size not supported");
215 break;
216 }
217
218 _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
219 const int input_width = input->info()->dimension(0);
220 const int input_height = input->info()->dimension(1);
221 const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
222 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
223
224 // Set instance variables
225 _input = input;
226 _output = output;
227 _pool_info = pool_info;
228 _border_size = BorderSize(pool_pad_y, pool_pad_x);
229 _border_size.right = std::max(upper_bound_w, pool_pad_x);
230 _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
231
232 // Select appropriate function
233 switch(pool_size)
234 {
235 case 2:
236 if(input->info()->data_type() == DataType::QS8)
237 {
238 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
239 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100240 else if(input->info()->data_type() == DataType::QS16)
241 {
242 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
243 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100244 else if(input->info()->data_type() == DataType::F16)
245 {
246 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
247 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100248 else if(input->info()->data_type() == DataType::F32)
249 {
250 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
251 }
252 break;
253 case 3:
254 if(input->info()->data_type() == DataType::QS8)
255 {
256 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
257 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100258 else if(input->info()->data_type() == DataType::QS16)
259 {
260 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
261 }
Pablo Tello0c34fe22017-06-26 17:17:42 +0100262 else if(input->info()->data_type() == DataType::F16)
263 {
264 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
265 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100266 else if(input->info()->data_type() == DataType::F32)
267 {
268 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
269 }
270 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100271 case 7:
272 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
273 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100274 default:
275 ARM_COMPUTE_ERROR("Unsupported pooling size");
276 break;
277 }
278
279 // Configure kernel window
280 Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
281 AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
282 AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
283 update_window_and_padding(win, input_access, output_access);
284 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
285 INEKernel::configure(win);
286}
287
288template <PoolingType pooling_type>
289void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
290{
291 Iterator input(_input, window_input);
292 Iterator output(_output, window);
293
294 const int fixed_point_position = _input->info()->fixed_point_position();
295 constexpr int pool_size = 2;
296 int pool_pad_x = 0;
297 int pool_pad_y = 0;
298 int pool_stride_x = 0;
299 int pool_stride_y = 0;
300 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
301 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
302 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
303 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
304
305 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
306 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
307
308 execute_window_loop(window, [&](const Coordinates & id)
309 {
310 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
311 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100312 qint8x8_t lower_res = {};
313 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100314 if(pooling_type == PoolingType::AVG)
315 {
316 // Calculate scale
317 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
318 const qint8x8_t scale_vec = vdup_n_qs8(scale);
319
320 // Perform pooling
321 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100322 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
323 if(pool_stride_x == 1)
324 {
325 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
326 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
327 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100328 }
329 else
330 {
331 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100332 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
333 if(pool_stride_x == 1)
334 {
335 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
336 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
337 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100338 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100339 if(pool_stride_x == 1)
340 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100341 const qint8x8x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100342 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
343 }
344 else
345 {
346 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
347 }
348 },
349 input, output);
350}
351
352template <PoolingType pooling_type>
353void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
354{
355 Iterator input(_input, window_input);
356 Iterator output(_output, window);
357
358 const int fixed_point_position = _input->info()->fixed_point_position();
359 constexpr int pool_size = 2;
360 int pool_pad_x = 0;
361 int pool_pad_y = 0;
362 int pool_stride_x = 0;
363 int pool_stride_y = 0;
364 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
365 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
366 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
367 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
368
369 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
370 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
371
372 execute_window_loop(window, [&](const Coordinates & id)
373 {
374 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
375 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
376 qint16x4_t lower_res = {};
377 qint16x4_t upper_res = {};
378 if(pooling_type == PoolingType::AVG)
379 {
380 // Calculate scale
381 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
382 const qint16x4_t scale_vec = vdup_n_qs16(scale);
383
384 // Perform pooling
385 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
386 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
387 if(pool_stride_x == 1)
388 {
389 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
390 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
391 }
392 }
393 else
394 {
395 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
396 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
397 if(pool_stride_x == 1)
398 {
399 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
400 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
401 }
402 }
403 if(pool_stride_x == 1)
404 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100405 const qint16x4x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100406 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
407 }
408 else
409 {
410 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
411 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100412 },
413 input, output);
414}
415
416template <PoolingType pooling_type>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100417void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
418{
419#ifdef ARM_COMPUTE_ENABLE_FP16
420 Iterator input(_input, window_input);
421 Iterator output(_output, window);
422
423 constexpr const int pool_size = 3;
424 int pool_pad_x = 0;
425 int pool_pad_y = 0;
426 int pool_stride_x = 0;
427 int pool_stride_y = 0;
428 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
429 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
430 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
431 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
432
433 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
434 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
435 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
436
437 execute_window_loop(window, [&](const Coordinates & id)
438 {
439 const float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
440 const float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
441 const float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
442 float16x4_t res = {};
443 if(pooling_type == PoolingType::AVG)
444 {
445 // Calculate scale
446 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
447 const float16x4_t scale_v = vdup_n_f16(scale);
448 // Perform pooling
449 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
450 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
451 res = vmul_f16(vpadd_f16(res, res), scale_v);
452 }
453 else
454 {
455 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
456 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
457 res = vpmax_f16(res, res);
458 }
459 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
460 },
461 input, output);
462#else /* ARM_COMPUTE_ENABLE_FP16 */
463 ARM_COMPUTE_UNUSED(window_input);
464 ARM_COMPUTE_UNUSED(window);
465 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
466#endif /* ARM_COMPUTE_ENABLE_FP16 */
467}
468
469template <PoolingType pooling_type>
470void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
471{
472#ifdef ARM_COMPUTE_ENABLE_FP16
473 Iterator input(_input, window_input);
474 Iterator output(_output, window);
475 constexpr int pool_size = 2;
476 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
477 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
478 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
479 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
480 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
481
482 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
483 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
484
485 execute_window_loop(window, [&](const Coordinates & id)
486 {
487 const auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
488 const auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
489 float16x8_t res = {};
490
491 if(pooling_type == PoolingType::AVG)
492 {
493 const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
494 const float16x8_t scale_v = vdupq_n_f16(scale);
495 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
496 }
497 else
498 {
499 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
500 }
501 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
502 },
503 input, output);
504#else /* ARM_COMPUTE_ENABLE_FP16 */
505 ARM_COMPUTE_UNUSED(window_input);
506 ARM_COMPUTE_UNUSED(window);
507 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
508#endif /* ARM_COMPUTE_ENABLE_FP16 */
509}
510
511template <PoolingType pooling_type>
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100512void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
513{
514 Iterator input(_input, window_input);
515 Iterator output(_output, window);
516
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100517 constexpr int pool_size = 2;
518 int pool_pad_x = 0;
519 int pool_pad_y = 0;
520 int pool_stride_x = 0;
521 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100522 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
523 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
524 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
525 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
526
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100527 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
528 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100529
530 execute_window_loop(window, [&](const Coordinates & id)
531 {
532 const float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
533 const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
534 float32x2_t res = {};
535 if(pooling_type == PoolingType::AVG)
536 {
537 // Calculate scale
538 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
539 const float32x2_t scale_v = vdup_n_f32(scale);
540
541 // Perform pooling
542 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
543 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
544 }
545 else
546 {
547 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
548 res = vpmax_f32(max_data, max_data);
549 }
550 *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
551 },
552 input, output);
553}
554
555template <PoolingType pooling_type>
556void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
557{
558 Iterator input(_input, window_input);
559 Iterator output(_output, window);
560
561 const int fixed_point_position = _input->info()->fixed_point_position();
562 constexpr int pool_size = 3;
563 int pool_pad_x = 0;
564 int pool_pad_y = 0;
565 int pool_stride_x = 0;
566 int pool_stride_y = 0;
567 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
568 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
569 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
570 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
571
572 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
573 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
574 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
575
576 execute_window_loop(window, [&](const Coordinates & id)
577 {
578 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
579 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
580 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
581 qint8x8_t res = {};
582 if(pooling_type == PoolingType::AVG)
583 {
584 // Calculate scale
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100585 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100586
587 // Perform pooling for stride 2
588 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
589 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
590 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
591 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
592 if(pool_stride_x == 2)
593 {
594 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
595 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100596 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100597 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100598 res = vqmul_qs8(res, scale_vec, fixed_point_position);
599 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100600 }
601 else
602 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100603 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
604 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100605 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100606 }
607 else
608 {
609 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
610 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
611 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
612 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
613
614 if(pool_stride_x == 2)
615 {
616 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
617 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
618 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100619 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100620 }
621 else
622 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100623 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100624 }
625 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100626 },
627 input, output);
628}
629
630template <PoolingType pooling_type>
631void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
632{
633 Iterator input(_input, window_input);
634 Iterator output(_output, window);
635
636 const int fixed_point_position = _input->info()->fixed_point_position();
637 constexpr int pool_size = 3;
638 int pool_pad_x = 0;
639 int pool_pad_y = 0;
640 int pool_stride_x = 0;
641 int pool_stride_y = 0;
642 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
643 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
644 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
645 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
646
647 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
648 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
649 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
650
651 execute_window_loop(window, [&](const Coordinates & id)
652 {
653 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
654 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
655 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
656
657 if(pooling_type == PoolingType::AVG)
658 {
659 // Calculate scale
660 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
661
662 // Perform pooling for stride 2
663 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
664 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
665 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
666 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
667 if(pool_stride_x == 2)
668 {
669 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
670 const qint16x4_t scale_vec = vdup_n_qs16(scale);
671 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
672 }
673 else
674 {
675 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
676 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
677 }
678 }
679 else
680 {
681 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
682 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
683 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
684 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
685
686 if(pool_stride_x == 2)
687 {
688 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
689 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
690 }
691 else
692 {
693 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
694 }
695 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100696 },
697 input, output);
698}
699
700template <PoolingType pooling_type>
701void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
702{
703 Iterator input(_input, window_input);
704 Iterator output(_output, window);
705
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100706 constexpr const int pool_size = 3;
707 int pool_pad_x = 0;
708 int pool_pad_y = 0;
709 int pool_stride_x = 0;
710 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100711 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
712 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
713 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
714 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
715
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100716 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
717 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
718 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100719
720 execute_window_loop(window, [&](const Coordinates & id)
721 {
722 const float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
723 const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
724 const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
725 float32x2_t res = {};
726 if(pooling_type == PoolingType::AVG)
727 {
728 // Calculate scale
729 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
730 const float32x2_t scale_v = vdup_n_f32(scale);
731
732 // Perform pooling
733 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
734 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
735 res = vmul_f32(vpadd_f32(res, res), scale_v);
736 }
737 else
738 {
739 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
740 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
741 res = vpmax_f32(res, res);
742 }
743 *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
744 },
745 input, output);
746}
747
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100748template <PoolingType pooling_type>
749void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
750{
751 Iterator input(_input, window_input);
752 Iterator output(_output, window);
753
754 constexpr const int pool_size = 7;
755 int pool_pad_x = 0;
756 int pool_pad_y = 0;
757 int pool_stride_x = 0;
758 int pool_stride_y = 0;
759 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
760 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
761 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
762 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
763
764 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
765 for(int i = 0; i < pool_size; ++i)
766 {
767 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
768 }
769
770 execute_window_loop(window, [&](const Coordinates & id)
771 {
772 float32x2_t res = {};
773 if(pooling_type == PoolingType::AVG)
774 {
775 // Calculate scale
776 float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
777 const float32x2_t scale_v = vdup_n_f32(scale);
778
779 // Perform pooling
780 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
781 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
782 for(int i = 1; i < pool_size; ++i)
783 {
784 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
785 sum_data = vaddq_f32(sum_data, data.val[0]);
786 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
787 }
788 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
789 res = vmul_f32(vpadd_f32(res, res), scale_v);
790 }
791 else
792 {
793 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
794 for(int i = 1; i < pool_size; ++i)
795 {
796 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
797 max_data = vmax2q_f32(max_data, data);
798 }
799 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
800 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
801 res = vpmax_f32(res, res);
802 }
803 *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
804 },
805 input, output);
806}
807
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100808void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100809{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100810 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100811 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
812 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
813 ARM_COMPUTE_ERROR_ON(_func == nullptr);
814
Pablo Tello0c34fe22017-06-26 17:17:42 +0100815 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
816 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100817
818 // Set step for input in x and y direction for the input
819 Window window_input(window);
820 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100821 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100822 {
Pablo Tello0c34fe22017-06-26 17:17:42 +0100823 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100824 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +0100825 case DataType::F16:
826 {
827 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
828 break;
829 }
830 case DataType::F32:
831 {
832 window_x_inc = pool_stride_x;
833 break;
834 }
835 default:
836 {
837 ARM_COMPUTE_ERROR("Not supported");
838 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100839 }
840 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
841 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
842
843 // Run function
844 (this->*_func)(window_input, window);
845}