blob: ac183d2f30933738bcc891ad918e0b14fb6e21ef [file] [log] [blame]
/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include "support/ToolchainSupport.h"

#include <algorithm>
#include <arm_neon.h>
#include <array>
#include <cmath>
#include <limits>
#include <set>
#include <string>
#include <tuple>
48
49using namespace arm_compute;
50
51namespace
52{
Michalis Spyrouafa5d812017-11-30 14:25:57 +000053void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
54{
55 TensorShape output_shape{ input->tensor_shape() };
56 output_shape.set(0, pooled_w);
57 output_shape.set(1, pooled_h);
58
59 auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
60}
61
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000062template <bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010063inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
64 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
65{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000066 int start_x = id.x() * stride_x - pad_x;
67 int start_y = id.y() * stride_y - pad_y;
Pablo Tello0c34fe22017-06-26 17:17:42 +010068 const int end_x = std::min(start_x + pool_size, upper_bound_w);
69 const int end_y = std::min(start_y + pool_size, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000070 if(exclude_padding)
71 {
72 start_x = std::max(0, start_x);
73 start_y = std::max(0, start_y);
74 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075 return 1.f / ((end_y - start_y) * (end_x - start_x));
76}
77
78inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
79 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
80{
Pablo Tello0c34fe22017-06-26 17:17:42 +010081 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010082 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
83 const int start_x = id.x() * stride_x - pad_x;
84 const int start_y = id.y() * stride_y - pad_y;
85 const int end_x = std::min(start_x + pool_size, upper_bound_w);
86 const int end_y = std::min(start_y + pool_size, upper_bound_h);
87 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010088 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
89}
90
91inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
92 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
93{
94 static std::array<qint16_t, 10> scale_values_q16 =
95 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
96 const int start_x = id.x() * stride_x - pad_x;
97 const int start_y = id.y() * stride_y - pad_y;
98 const int end_x = std::min(start_x + pool_size, upper_bound_w);
99 const int end_y = std::min(start_y + pool_size, upper_bound_h);
100 const int val = ((end_y - start_y) * (end_x - start_x));
101 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100102}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103
Georgios Pinitas55186712018-01-08 17:37:12 +0000104template <bool exclude_padding>
105inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
106 const int pool_size, const int upper_bound_w, const int upper_bound_h,
107 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
108{
109 int start_x = (id.x() + id_offset) * stride_x - pad_x;
110 int start_y = id.y() * stride_y - pad_y;
111 const int end_y = std::min(start_y + pool_size, upper_bound_h);
112 if(exclude_padding)
113 {
114 start_y = std::max(0, start_y);
115 }
116
117 std::array<uint16_t, 8> elems =
118 {
119 {
120 vgetq_lane_u16(v, 0),
121 vgetq_lane_u16(v, 1),
122 vgetq_lane_u16(v, 2),
123 vgetq_lane_u16(v, 3),
124 vgetq_lane_u16(v, 4),
125 vgetq_lane_u16(v, 5),
126 vgetq_lane_u16(v, 6),
127 vgetq_lane_u16(v, 7),
128 }
129 };
130
131 for(auto &el : elems)
132 {
133 int c_start_x = start_x;
134 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
135 if(exclude_padding)
136 {
137 c_start_x = std::max(0, c_start_x);
138 }
139 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
140 el *= scale;
141 start_x += step * stride_x;
142 }
143
144 v = vsetq_lane_u16(elems[0], v, 0);
145 v = vsetq_lane_u16(elems[1], v, 1);
146 v = vsetq_lane_u16(elems[2], v, 2);
147 v = vsetq_lane_u16(elems[3], v, 3);
148 v = vsetq_lane_u16(elems[4], v, 4);
149 v = vsetq_lane_u16(elems[5], v, 5);
150 v = vsetq_lane_u16(elems[6], v, 6);
151 v = vsetq_lane_u16(elems[7], v, 7);
152}
153
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000154Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000156 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100157
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000158 int pool_pad_x = 0;
159 int pool_pad_y = 0;
160 int pool_stride_x = 0;
161 int pool_stride_y = 0;
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000162 PoolingType pool_type = pool_info.pool_type();
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000163 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
164 const bool exclude_padding = pool_info.exclude_padding();
165 const bool is_global_pooling = pool_info.is_global_pooling();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100166 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
167 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Gian Marco Iodice16824302017-09-28 15:41:37 +0100168 static const std::set<int> supported_pool_sizes = { 2, 3 };
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100169
Georgios Pinitas55186712018-01-08 17:37:12 +0000170 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
171 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
172 ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8)));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000173 ARM_COMPUTE_RETURN_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size));
174 ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()));
175 ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
176 ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100177
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000178 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100179 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000180 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
181 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
182 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h));
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100183 }
184
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000185 return Status{};
186}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100187
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000188Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size)
189{
190 const bool is_global_pooling = pool_info.is_global_pooling();
191 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
192 "Global pooling is supported only with rectangular inputs!");
193 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
194 "Invalid pool size and pool pad combination!");
195
196 return Status{};
197}
198
199std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
200 BorderSize &border_size,
201 unsigned int pooled_w, unsigned int pooled_h, int pool_size)
202{
203 unsigned int num_elems_read_per_iteration = 0;
204 unsigned int num_elems_horizontal_window = 0;
205 int pool_pad_x = 0;
206 int pool_pad_y = 0;
207 int pool_stride_x = 0;
208 int pool_stride_y = 0;
209 const int input_width = input->dimension(0);
210 const int input_height = input->dimension(1);
211 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
212 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
213 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
214
215 // Check output dimensions
216 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
217 input->dimension(1),
218 pool_size,
219 pool_size,
220 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100221
222 // Select element size
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000223 switch(input->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100224 {
225 case DataType::QS8:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100226 num_elems_read_per_iteration = 16;
227 switch(pool_size)
228 {
229 case 2:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100230 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100231 break;
232 case 3:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100233 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100234 break;
235 default:
236 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100237 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100238 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100239 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
240 break;
Georgios Pinitas55186712018-01-08 17:37:12 +0000241 case DataType::QASYMM8:
242 switch(pool_size)
243 {
244 case 2:
245 num_elems_read_per_iteration = 16;
246 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
247 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
248 break;
249 case 3:
250 num_elems_read_per_iteration = 16;
251 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
252 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
253 break;
254 default:
255 num_elems_read_per_iteration = 1;
256 num_elems_processed_per_iteration = 1;
257 num_elems_horizontal_window = 1;
258 break;
259 }
260 break;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100261 case DataType::QS16:
262 num_elems_read_per_iteration = 8;
263 switch(pool_size)
264 {
265 case 2:
266 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
267 break;
268 case 3:
269 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
270 break;
271 default:
272 ARM_COMPUTE_ERROR("Pooling size not supported");
273 }
274 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100275 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000276#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100277 case DataType::F16:
278 switch(pool_size)
279 {
280 case 2:
281 num_elems_read_per_iteration = 16;
282 num_elems_processed_per_iteration = 8;
283 num_elems_horizontal_window = 8;
284 break;
285 case 3:
286 num_elems_read_per_iteration = 4;
287 num_elems_processed_per_iteration = 1;
288 num_elems_horizontal_window = 1;
289 break;
290 default:
291 ARM_COMPUTE_ERROR("Pooling size not supported");
292 break;
293 }
294 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000295#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100296 case DataType::F32:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100297 switch(pool_size)
298 {
299 case 2:
300 num_elems_read_per_iteration = 2;
301 break;
302 case 3:
303 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
304 break;
305 case 7:
306 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
307 break;
308 default:
Gian Marco Iodice16824302017-09-28 15:41:37 +0100309 num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop
Pablo Tello0c34fe22017-06-26 17:17:42 +0100310 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100311 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100312 num_elems_processed_per_iteration = 1;
313 num_elems_horizontal_window = 1;
314 break;
315 default:
316 ARM_COMPUTE_ERROR("Element size not supported");
317 break;
318 }
319
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000320 const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
321 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
322
323 border_size = BorderSize(pool_pad_y, pool_pad_x);
324 border_size.right = std::max(upper_bound_w, pool_pad_x);
325 border_size.bottom = std::max(upper_bound_h, pool_pad_y);
326 bool window_changed = false;
327
328 TensorShape output_shape{ input->tensor_shape() };
329 output_shape.set(0, pooled_w);
330 output_shape.set(1, pooled_h);
331 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
332
333 Window win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
334 AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
335
336 if(output->total_size() != 0)
337 {
338 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
339 window_changed = update_window_and_padding(win, input_access, output_access);
340 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
341 }
342 else
343 {
344 window_changed = update_window_and_padding(win, input_access);
345 }
346
347 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
348 return std::make_pair(err, win);
349}
350} // namespace
351
352NEPoolingLayerKernel::NEPoolingLayerKernel()
353 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
354{
355}
356
357BorderSize NEPoolingLayerKernel::border_size() const
358{
359 return _border_size;
360}
361
362void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
363{
364 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
365
366 int pool_pad_x = 0;
367 int pool_pad_y = 0;
368 int pool_stride_x = 0;
369 int pool_stride_y = 0;
370 unsigned int pooled_w = 0;
371 unsigned int pooled_h = 0;
372 PoolingType pool_type = pool_info.pool_type();
373 int pool_size = pool_info.pool_size();
374 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
375 const bool exclude_padding = pool_info.exclude_padding();
376 const bool is_global_pooling = pool_info.is_global_pooling();
377 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
378 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
379
380 // Update pool size in case of global pooling
381 pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size;
382
383 // Validate pool info before calling scaled_dimensions
384 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));
385
386 // Check output dimensions
387 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
388 input->info()->dimension(1),
389 pool_size,
390 pool_size,
391 pool_info.pad_stride_info());
392
393 // Output auto initialization if not yet initialized
394 auto_init(input->info(), output->info(), pooled_w, pooled_h);
395
396 // Perform validation step
397 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100398
399 // Set instance variables
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000400 _input = input;
401 _output = output;
402 _pool_info = pool_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100403
Georgios Pinitas55186712018-01-08 17:37:12 +0000404 // Get data type
405 const DataType data_type = input->info()->data_type();
406
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100407 // Select appropriate function
Georgios Pinitas55186712018-01-08 17:37:12 +0000408 if(data_type == DataType::QS8)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100409 {
Georgios Pinitas55186712018-01-08 17:37:12 +0000410 switch(pool_size)
411 {
412 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100413 switch(pool_type)
414 {
415 case PoolingType::AVG:
416 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
417 break;
418 case PoolingType::MAX:
419 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
420 break;
421 default:
422 ARM_COMPUTE_ERROR("Unsupported pooling type!");
423 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000424 break;
425 case 3:
426 switch(pool_type)
427 {
428 case PoolingType::AVG:
429 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
430 break;
431 case PoolingType::MAX:
432 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
433 break;
434 default:
435 ARM_COMPUTE_ERROR("Unsupported pooling type!");
436 }
437 break;
438 default:
439 ARM_COMPUTE_ERROR("Unsupported pooling size!");
440 }
441 }
442 else if(data_type == DataType::QASYMM8)
443 {
444 if(pool_size == 2 && pool_stride_x < 3)
445 {
446 switch(pool_type)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100447 {
Georgios Pinitas55186712018-01-08 17:37:12 +0000448 case PoolingType::AVG:
449 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, false>;
450 break;
451 case PoolingType::MAX:
452 _func = &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::MAX>;
453 break;
454 default:
455 ARM_COMPUTE_ERROR("Unsupported pooling type!");
456 }
457 }
458 else if(pool_size == 3 && pool_stride_x < 3)
459 {
460 switch(pool_type)
461 {
462 case PoolingType::AVG:
463 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, false>;
464 break;
465 case PoolingType::MAX:
466 _func = &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::MAX>;
467 break;
468 default:
469 ARM_COMPUTE_ERROR("Unsupported pooling type!");
470 }
471 }
472 else
473 {
474 switch(pool_type)
475 {
476 case PoolingType::AVG:
477 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, false>;
478 break;
479 case PoolingType::MAX:
480 _func = &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::MAX>;
481 break;
482 default:
483 ARM_COMPUTE_ERROR("Unsupported pooling type!");
484 }
485 }
486 }
487 else if(data_type == DataType::QS16)
488 {
489 switch(pool_size)
490 {
491 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100492 switch(pool_type)
493 {
494 case PoolingType::AVG:
495 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
496 break;
497 case PoolingType::MAX:
498 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
499 break;
500 default:
501 ARM_COMPUTE_ERROR("Unsupported pooling type!");
502 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000503 break;
504 case 3:
505 switch(pool_type)
506 {
507 case PoolingType::AVG:
508 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
509 break;
510 case PoolingType::MAX:
511 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
512 break;
513 default:
514 ARM_COMPUTE_ERROR("Unsupported pooling type!");
515 }
516 break;
517 default:
518 ARM_COMPUTE_ERROR("Unsupported pooling size!");
519 }
520 }
521 else if(data_type == DataType::F16)
522 {
523 switch(pool_size)
524 {
525 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100526 switch(pool_type)
527 {
528 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000529 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100530 break;
531 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000532 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100533 break;
534 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000535 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100536 break;
537 default:
538 ARM_COMPUTE_ERROR("Unsupported pooling type!");
539 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000540 break;
541 case 3:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100542 switch(pool_type)
543 {
544 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000545 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100546 break;
547 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000548 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100549 break;
550 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000551 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100552 break;
553 default:
554 ARM_COMPUTE_ERROR("Unsupported pooling type!");
555 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000556 break;
557 default:
558 ARM_COMPUTE_ERROR("Unsupported pooling size!");
559 }
560 }
561 else if(data_type == DataType::F32)
562 {
563 switch(pool_size)
564 {
565 case 2:
566 switch(pool_type)
567 {
568 case PoolingType::AVG:
569 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
570 break;
571 case PoolingType::L2:
572 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
573 break;
574 case PoolingType::MAX:
575 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
576 break;
577 default:
578 ARM_COMPUTE_ERROR("Unsupported pooling type!");
579 }
580 break;
581 case 3:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100582 switch(pool_type)
583 {
584 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000585 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100586 break;
587 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000588 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100589 break;
590 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000591 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100592 break;
593 default:
594 ARM_COMPUTE_ERROR("Unsupported pooling type!");
595 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000596 break;
597 case 7:
598 switch(pool_type)
599 {
600 case PoolingType::AVG:
601 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
602 break;
603 case PoolingType::L2:
604 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
605 break;
606 case PoolingType::MAX:
607 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
608 break;
609 default:
610 ARM_COMPUTE_ERROR("Unsupported pooling type!");
611 }
612 break;
613 default:
614 switch(pool_type)
615 {
616 case PoolingType::AVG:
617 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
618 break;
619 case PoolingType::L2:
620 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
621 break;
622 case PoolingType::MAX:
623 _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
624 break;
625 default:
626 ARM_COMPUTE_ERROR("Unsupported pooling type!");
627 }
628 break;
629 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100630 }
631
632 // Configure kernel window
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000633 auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size);
634 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
635 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100636}
637
638template <PoolingType pooling_type>
639void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
640{
641 Iterator input(_input, window_input);
642 Iterator output(_output, window);
643
644 const int fixed_point_position = _input->info()->fixed_point_position();
645 constexpr int pool_size = 2;
646 int pool_pad_x = 0;
647 int pool_pad_y = 0;
648 int pool_stride_x = 0;
649 int pool_stride_y = 0;
650 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
651 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
652 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
653 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
654
655 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
656 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
657
658 execute_window_loop(window, [&](const Coordinates & id)
659 {
660 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
661 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100662 qint8x8_t lower_res = {};
663 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100664 if(pooling_type == PoolingType::AVG)
665 {
666 // Calculate scale
667 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
668 const qint8x8_t scale_vec = vdup_n_qs8(scale);
669
670 // Perform pooling
671 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100672 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
673 if(pool_stride_x == 1)
674 {
675 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
676 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
677 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100678 }
679 else
680 {
681 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100682 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
683 if(pool_stride_x == 1)
684 {
685 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
686 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
687 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100688 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100689 if(pool_stride_x == 1)
690 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100691 const qint8x8x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100692 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
693 }
694 else
695 {
696 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
697 }
698 },
699 input, output);
700}
701
Georgios Pinitas55186712018-01-08 17:37:12 +0000702template <PoolingType pooling_type, bool exclude_padding>
703void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window)
704{
705 Iterator input(_input, window_input);
706 Iterator output(_output, window);
707
708 constexpr int pool_size = 2;
709 int pool_pad_x = 0;
710 int pool_pad_y = 0;
711 int pool_stride_x = 0;
712 int pool_stride_y = 0;
713 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
714 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
715 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
716 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
717
718 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
719 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
720
721 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
722
723 execute_window_loop(window, [&](const Coordinates & id)
724 {
725 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
726 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
727 uint8x8_t lower_res = {};
728 uint8x8_t upper_res = {};
729
730 if(pooling_type != PoolingType::MAX)
731 {
732 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
733 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
734
735 // Add rows
736 const uint16x8x2_t vrsum =
737 {
738 {
739 vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]),
740 vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]),
741 }
742 };
743
744 // Pair-wise add row data
745 const uint16x4x2_t vpsum =
746 {
747 {
748 vpadd_u16(vget_low_u16(vrsum.val[0]), vget_high_u16(vrsum.val[0])),
749 vpadd_u16(vget_low_u16(vrsum.val[1]), vget_high_u16(vrsum.val[1])),
750 }
751 };
752
753 uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]);
754
755 // Scale lower result
756 scale_vector_s16x8<exclude_padding>(res_lower, id, 0, scale_step_x,
757 pool_size, upper_bound_w, upper_bound_h,
758 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
759 lower_res = vmovn_u16(res_lower);
760
761 // Compute upper result for stride_x == 1
762 if(pool_stride_x == 1)
763 {
764 // Shifted row sum
765 const uint16x8x2_t vrsum_shifted =
766 {
767 {
768 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
769 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
770 }
771 };
772
773 // Pair-wise add shifted row
774 const uint16x4x2_t vpsum_shifted =
775 {
776 {
777 vpadd_u16(vget_low_u16(vrsum_shifted.val[0]), vget_high_u16(vrsum_shifted.val[0])),
778 vpadd_u16(vget_low_u16(vrsum_shifted.val[1]), vget_high_u16(vrsum_shifted.val[1])),
779 }
780 };
781 uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]);
782
783 // Scale lower result
784 scale_vector_s16x8<exclude_padding>(res_upper, id, 1, 2,
785 pool_size, upper_bound_w, upper_bound_h,
786 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
787 upper_res = vmovn_u16(res_upper);
788 }
789 }
790 else
791 {
792 const uint8x16_t max_data = vmaxq_u8(top_data, bottom_data);
793 lower_res = vpmax_u8(vget_low_u8(max_data), vget_high_u8(max_data));
794 if(pool_stride_x == 1)
795 {
796 const uint8x16_t max_data_shifted = vextq_u8(max_data, max_data, 1);
797 upper_res = vpmax_u8(vget_low_u8(max_data_shifted), vget_high_u8(max_data_shifted));
798 }
799 }
800
801 // Store result
802 if(pool_stride_x == 1)
803 {
804 const uint8x8x2_t res = { { lower_res, upper_res } };
805 vst2_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
806 }
807 else
808 {
809 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), lower_res);
810 }
811 },
812 input, output);
813}
814
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100815template <PoolingType pooling_type>
816void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
817{
818 Iterator input(_input, window_input);
819 Iterator output(_output, window);
820
821 const int fixed_point_position = _input->info()->fixed_point_position();
822 constexpr int pool_size = 2;
823 int pool_pad_x = 0;
824 int pool_pad_y = 0;
825 int pool_stride_x = 0;
826 int pool_stride_y = 0;
827 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
828 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
829 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
830 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
831
832 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
833 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
834
835 execute_window_loop(window, [&](const Coordinates & id)
836 {
837 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
838 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
839 qint16x4_t lower_res = {};
840 qint16x4_t upper_res = {};
841 if(pooling_type == PoolingType::AVG)
842 {
843 // Calculate scale
844 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
845 const qint16x4_t scale_vec = vdup_n_qs16(scale);
846
847 // Perform pooling
848 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
849 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
850 if(pool_stride_x == 1)
851 {
852 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
853 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
854 }
855 }
856 else
857 {
858 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
859 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
860 if(pool_stride_x == 1)
861 {
862 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
863 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
864 }
865 }
866 if(pool_stride_x == 1)
867 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100868 const qint16x4x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100869 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
870 }
871 else
872 {
873 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
874 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100875 },
876 input, output);
877}
878
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000879template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100880void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
881{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000882#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100883 Iterator input(_input, window_input);
884 Iterator output(_output, window);
885
886 constexpr const int pool_size = 3;
887 int pool_pad_x = 0;
888 int pool_pad_y = 0;
889 int pool_stride_x = 0;
890 int pool_stride_y = 0;
891 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
892 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000893 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
894 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100895
896 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
897 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
898 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
899
900 execute_window_loop(window, [&](const Coordinates & id)
901 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100902 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
903 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
904 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
905 float16x4_t res = {};
906
907 // Get power of 2 in case of l2 pooling
908 if(pooling_type == PoolingType::L2)
909 {
910 top_data = vmul_f16(top_data, top_data);
911 middle_data = vmul_f16(middle_data, middle_data);
912 bottom_data = vmul_f16(bottom_data, bottom_data);
913 }
914
915 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100916 {
917 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000918 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100919 const float16x4_t scale_v = vdup_n_f16(scale);
920 // Perform pooling
921 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
922 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
923 res = vmul_f16(vpadd_f16(res, res), scale_v);
924 }
925 else
926 {
927 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
928 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
929 res = vpmax_f16(res, res);
930 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100931
932 // Calculate square-root in case of l2 pooling
933 if(pooling_type == PoolingType::L2)
934 {
935 res = vinv_f16(vinvsqrt_f16(res));
936 }
937
Pablo Tello0c34fe22017-06-26 17:17:42 +0100938 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
939 },
940 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000941#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100942 ARM_COMPUTE_UNUSED(window_input);
943 ARM_COMPUTE_UNUSED(window);
944 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000945#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100946}
947
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000948template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100949void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
950{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000951#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100952 Iterator input(_input, window_input);
953 Iterator output(_output, window);
954 constexpr int pool_size = 2;
955 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
956 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
957 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000958 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
959 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100960
961 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
962 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
963
964 execute_window_loop(window, [&](const Coordinates & id)
965 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100966 auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
967 auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100968 float16x8_t res = {};
969
Georgios Pinitascdf51452017-08-31 14:21:36 +0100970 // Get power of 2 in case of l2 pooling
971 if(pooling_type == PoolingType::L2)
972 {
973 top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
974 top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
975 bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
976 bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
977 }
978
979 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100980 {
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000981 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100982 const float16x8_t scale_v = vdupq_n_f16(scale);
983 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
984 }
985 else
986 {
987 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
988 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100989
990 // Calculate square-root in case of l2 pooling
991 if(pooling_type == PoolingType::L2)
992 {
993 res = vinvq_f16(vinvsqrtq_f16(res));
994 }
995
996 // Store result
Pablo Tello0c34fe22017-06-26 17:17:42 +0100997 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
998 },
999 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001000#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001001 ARM_COMPUTE_UNUSED(window_input);
1002 ARM_COMPUTE_UNUSED(window);
1003 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001004#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001005}
1006
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001007template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001008void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
1009{
1010 Iterator input(_input, window_input);
1011 Iterator output(_output, window);
1012
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001013 constexpr int pool_size = 2;
1014 int pool_pad_x = 0;
1015 int pool_pad_y = 0;
1016 int pool_stride_x = 0;
1017 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001018 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1019 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001020 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1021 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001022
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001023 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1024 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001025
1026 execute_window_loop(window, [&](const Coordinates & id)
1027 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001028 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1029 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1030 float32x2_t res = {};
1031 float final_res = 0;
1032
1033 // Get power of 2 in case of l2 pooling
1034 if(pooling_type == PoolingType::L2)
1035 {
1036 top_data = vmul_f32(top_data, top_data);
1037 bottom_data = vmul_f32(bottom_data, bottom_data);
1038 }
1039
1040 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001041 {
1042 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001043 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001044 const float32x2_t scale_v = vdup_n_f32(scale);
1045
1046 // Perform pooling
1047 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1048 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1049 }
1050 else
1051 {
1052 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1053 res = vpmax_f32(max_data, max_data);
1054 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001055 final_res = vget_lane_f32(res, 0);
1056
1057 // Calculate square-root in case of l2 pooling
1058 if(pooling_type == PoolingType::L2)
1059 {
1060 final_res = sqrt(final_res);
1061 }
1062
1063 // Store result
1064 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001065 },
1066 input, output);
1067}
1068
1069template <PoolingType pooling_type>
1070void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
1071{
1072 Iterator input(_input, window_input);
1073 Iterator output(_output, window);
1074
1075 const int fixed_point_position = _input->info()->fixed_point_position();
1076 constexpr int pool_size = 3;
1077 int pool_pad_x = 0;
1078 int pool_pad_y = 0;
1079 int pool_stride_x = 0;
1080 int pool_stride_y = 0;
1081 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1082 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1083 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
1084 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
1085
1086 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1087 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
1088 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
1089
1090 execute_window_loop(window, [&](const Coordinates & id)
1091 {
1092 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
1093 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
1094 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
1095 qint8x8_t res = {};
1096 if(pooling_type == PoolingType::AVG)
1097 {
1098 // Calculate scale
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001099 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001100
1101 // Perform pooling for stride 2
1102 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
1103 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
1104 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
1105 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
1106 if(pool_stride_x == 2)
1107 {
1108 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
1109 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001110 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001111 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001112 res = vqmul_qs8(res, scale_vec, fixed_point_position);
1113 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001114 }
1115 else
1116 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001117 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
1118 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001119 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001120 }
1121 else
1122 {
1123 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
1124 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
1125 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
1126 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
1127
1128 if(pool_stride_x == 2)
1129 {
1130 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
1131 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1132 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001133 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001134 }
1135 else
1136 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001137 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001138 }
1139 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001140 },
1141 input, output);
1142}
1143
Georgios Pinitas55186712018-01-08 17:37:12 +00001144template <PoolingType pooling_type, bool exclude_padding>
1145void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window)
1146{
1147 Iterator input(_input, window_input);
1148 Iterator output(_output, window);
1149
1150 constexpr int pool_size = 3;
1151 int pool_pad_x = 0;
1152 int pool_pad_y = 0;
1153 int pool_stride_x = 0;
1154 int pool_stride_y = 0;
1155 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1156 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1157 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1158 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
1159
1160 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1161 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
1162 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
1163
1164 execute_window_loop(window, [&](const Coordinates & id)
1165 {
1166 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
1167 const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
1168 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
1169
1170 if(pooling_type == PoolingType::AVG)
1171 {
1172 // Convert data to u16
1173 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
1174 const uint16x8x2_t middle_data_u16 = { { vmovl_u8(vget_low_u8(middle_data)), vmovl_u8(vget_high_u8(middle_data)) } };
1175 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
1176
1177 // Calculate row sums
1178 const uint16x8x2_t vrsum =
1179 {
1180 {
1181 vaddq_u16(vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]), middle_data_u16.val[0]),
1182 vaddq_u16(vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]), middle_data_u16.val[1]),
1183 }
1184 };
1185 const uint16x8x2_t vrsum_shifted_1 =
1186 {
1187 {
1188 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
1189 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
1190 }
1191 };
1192 const uint16x8x2_t vrsum_shifted_2 =
1193 {
1194 {
1195 vextq_u16(vrsum.val[0], vrsum.val[1], 2),
1196 vextq_u16(vrsum.val[1], vrsum.val[1], 2)
1197 }
1198 };
1199 // Calculate final sum
1200 uint16x8x2_t final_sum =
1201 {
1202 {
1203 vaddq_u16(vaddq_u16(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1204 vaddq_u16(vaddq_u16(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
1205 }
1206 };
1207 if(pool_stride_x == 2)
1208 {
1209 uint16x8_t res =
1210 {
1211 vgetq_lane_u16(final_sum.val[0], 0),
1212 vgetq_lane_u16(final_sum.val[0], 2),
1213 vgetq_lane_u16(final_sum.val[0], 4),
1214 vgetq_lane_u16(final_sum.val[0], 6),
1215 vgetq_lane_u16(final_sum.val[1], 0),
1216 vgetq_lane_u16(final_sum.val[1], 2),
1217 vgetq_lane_u16(final_sum.val[1], 4),
1218 vgetq_lane_u16(final_sum.val[1], 6),
1219 };
1220
1221 scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
1222 pool_size, upper_bound_w, upper_bound_h,
1223 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1224 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
1225 }
1226 else
1227 {
1228 // Scale lower result
1229 scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
1230 pool_size, upper_bound_w, upper_bound_h,
1231 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1232 // Scale lower result
1233 scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
1234 pool_size, upper_bound_w, upper_bound_h,
1235 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1236 const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
1237 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
1238 }
1239 }
1240 else
1241 {
1242 const uint8x16_t max_data = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data);
1243 const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1);
1244 const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2);
1245 const uint8x16_t final_max = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2);
1246
1247 if(pool_stride_x == 2)
1248 {
1249 const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
1250 static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1251 const uint8x8_t res = vtbl2_u8(table, lookup_val);
1252 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
1253 }
1254 else
1255 {
1256 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
1257 }
1258 }
1259 },
1260 input, output);
1261}
1262
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001263template <PoolingType pooling_type>
1264void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
1265{
1266 Iterator input(_input, window_input);
1267 Iterator output(_output, window);
1268
1269 const int fixed_point_position = _input->info()->fixed_point_position();
1270 constexpr int pool_size = 3;
1271 int pool_pad_x = 0;
1272 int pool_pad_y = 0;
1273 int pool_stride_x = 0;
1274 int pool_stride_y = 0;
1275 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1276 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1277 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
1278 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
1279
1280 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1281 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
1282 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
1283
1284 execute_window_loop(window, [&](const Coordinates & id)
1285 {
1286 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
1287 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
1288 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
1289
1290 if(pooling_type == PoolingType::AVG)
1291 {
1292 // Calculate scale
1293 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
1294
1295 // Perform pooling for stride 2
1296 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
1297 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
1298 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
1299 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
1300 if(pool_stride_x == 2)
1301 {
1302 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
1303 const qint16x4_t scale_vec = vdup_n_qs16(scale);
1304 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
1305 }
1306 else
1307 {
1308 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
1309 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
1310 }
1311 }
1312 else
1313 {
1314 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
1315 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
1316 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
1317 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
1318
1319 if(pool_stride_x == 2)
1320 {
1321 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
1322 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
1323 }
1324 else
1325 {
1326 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
1327 }
1328 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001329 },
1330 input, output);
1331}
1332
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001333template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001334void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
1335{
1336 Iterator input(_input, window_input);
1337 Iterator output(_output, window);
1338
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001339 constexpr const int pool_size = 3;
1340 int pool_pad_x = 0;
1341 int pool_pad_y = 0;
1342 int pool_stride_x = 0;
1343 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001344 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1345 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001346 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1347 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001348
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001349 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1350 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
1351 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001352
1353 execute_window_loop(window, [&](const Coordinates & id)
1354 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001355 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1356 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1357 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1358 float32x2_t res = {};
1359 float final_res = 0;
1360
1361 // Get power of 2 in case of l2 pooling
1362 if(pooling_type == PoolingType::L2)
1363 {
1364 top_data = vmulq_f32(top_data, top_data);
1365 middle_data = vmulq_f32(middle_data, middle_data);
1366 bottom_data = vmulq_f32(bottom_data, bottom_data);
1367 }
1368
1369 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001370 {
1371 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001372 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001373 const float32x2_t scale_v = vdup_n_f32(scale);
1374
1375 // Perform pooling
1376 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1377 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1378 res = vmul_f32(vpadd_f32(res, res), scale_v);
1379 }
1380 else
1381 {
1382 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1383 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1384 res = vpmax_f32(res, res);
1385 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001386 final_res = vget_lane_f32(res, 0);
1387
1388 // Calculate square-root in case of l2 pooling
1389 if(pooling_type == PoolingType::L2)
1390 {
1391 final_res = sqrt(final_res);
1392 }
1393
1394 // Store result
1395 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001396 },
1397 input, output);
1398}
1399
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001400template <PoolingType pooling_type, bool exclude_padding>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001401void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
1402{
1403 Iterator input(_input, window_input);
1404 Iterator output(_output, window);
1405
1406 constexpr const int pool_size = 7;
1407 int pool_pad_x = 0;
1408 int pool_pad_y = 0;
1409 int pool_stride_x = 0;
1410 int pool_stride_y = 0;
1411 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1412 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001413 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1414 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001415
1416 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1417 for(int i = 0; i < pool_size; ++i)
1418 {
1419 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
1420 }
1421
1422 execute_window_loop(window, [&](const Coordinates & id)
1423 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001424 float32x2_t res = {};
1425 float final_res = 0.f;
1426 if(pooling_type != PoolingType::MAX)
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001427 {
1428 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001429 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001430 const float32x2_t scale_v = vdup_n_f32(scale);
1431
1432 // Perform pooling
Georgios Pinitascdf51452017-08-31 14:21:36 +01001433 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1434 // Get power of 2 in case of l2 pooling
1435 if(pooling_type == PoolingType::L2)
1436 {
1437 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1438 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1439 }
1440 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001441 for(int i = 1; i < pool_size; ++i)
1442 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001443 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1444 // Get power of 2 in case of l2 pooling
1445 if(pooling_type == PoolingType::L2)
1446 {
1447 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1448 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1449 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001450 sum_data = vaddq_f32(sum_data, data.val[0]);
1451 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1452 }
1453 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1454 res = vmul_f32(vpadd_f32(res, res), scale_v);
1455 }
1456 else
1457 {
1458 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1459 for(int i = 1; i < pool_size; ++i)
1460 {
1461 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1462 max_data = vmax2q_f32(max_data, data);
1463 }
1464 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1465 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1466 res = vpmax_f32(res, res);
1467 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001468 final_res = vget_lane_f32(res, 0);
1469
1470 // Calculate square-root in case of l2 pooling
1471 if(pooling_type == PoolingType::L2)
1472 {
1473 final_res = sqrt(final_res);
1474 }
1475
1476 // Store result
1477 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001478 },
1479 input, output);
1480}
1481
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001482template <PoolingType pooling_type, bool exclude_padding>
Gian Marco Iodice16824302017-09-28 15:41:37 +01001483void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
1484{
1485 Iterator input(_input, window_input);
1486 Iterator output(_output, window);
1487
Georgios Pinitas4c2dd542017-11-13 12:58:41 +00001488 const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001489 int pool_pad_x = 0;
1490 int pool_pad_y = 0;
1491 int pool_stride_x = 0;
1492 int pool_stride_y = 0;
1493 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1494 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001495 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1496 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001497
1498 execute_window_loop(window, [&](const Coordinates & id)
1499 {
1500 float res = 0.0f;
1501
1502 if(pooling_type != PoolingType::MAX)
1503 {
1504 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001505 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001506
1507 // Perform pooling
1508 float32x4_t vres = vdupq_n_f32(0.0f);
1509
1510 for(int y = 0; y < pool_size; ++y)
1511 {
1512 int x = 0;
1513 for(; x <= (pool_size - 4); x += 4)
1514 {
1515 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
1516 (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1517
1518 // Get power of 2 in case of l2 pooling and accumulate
1519 if(pooling_type == PoolingType::L2)
1520 {
1521 vres = vmlaq_f32(vres, data, data);
1522 }
1523 else
1524 {
1525 vres = vaddq_f32(vres, data);
1526 }
1527 }
1528
1529 // Leftover for loop
1530 for(; x < pool_size; ++x)
1531 {
1532 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1533
1534 // Get power of 2 in case of l2 pooling
1535 if(pooling_type == PoolingType::L2)
1536 {
1537 data *= data;
1538 }
1539
1540 res += data;
1541 }
1542 }
1543
1544#if defined(__aarch64__)
1545 // Reduction operation available on 64 bit architectures only
1546 res += vaddvq_f32(vres);
1547#else // __aarch64__
1548 // Reduction
1549 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1550 tmp = vpadd_f32(tmp, tmp);
1551
1552 res += vget_lane_f32(tmp, 0);
1553#endif // __aarch64__
1554 // Divide by scale
1555 res *= scale;
1556 }
1557 else
1558 {
1559 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::min());
1560 res = std::numeric_limits<float>::min();
1561
1562 for(int y = 0; y < pool_size; ++y)
1563 {
1564 int x = 0;
1565 for(; x <= (pool_size - 4); x += 4)
1566 {
1567 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
1568 (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1569 vres = vmaxq_f32(vres, data);
1570 }
1571
1572 // Leftover for loop
1573 for(; x < pool_size; ++x)
1574 {
1575 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1576 res = std::max(res, data);
1577 }
1578 }
1579
1580#if defined(__aarch64__)
1581 // Reduction operation available on 64 bit architectures only
1582 res = std::max(vmaxvq_f32(vres), res);
1583#else // __aarch64__
1584 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1585 tmp = vpmax_f32(tmp, tmp);
1586
1587 res = std::max(res, vget_lane_f32(tmp, 0));
1588#endif // __aarch64__
1589 }
1590
1591 // Calculate square-root in case of l2 pooling
1592 if(pooling_type == PoolingType::L2)
1593 {
1594 res = std::sqrt(res);
1595 }
1596
1597 // Store result
1598 *(reinterpret_cast<float *>(output.ptr())) = res;
1599 },
1600 input, output);
1601}
1602
Georgios Pinitas55186712018-01-08 17:37:12 +00001603template <PoolingType pooling_type, bool exclude_padding>
1604void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Window &window)
1605{
1606 Iterator input(_input, window_input);
1607 Iterator output(_output, window);
1608
1609 const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
1610 int pool_pad_x = 0;
1611 int pool_pad_y = 0;
1612 int pool_stride_x = 0;
1613 int pool_stride_y = 0;
1614 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1615 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1616 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1617 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
1618
1619 execute_window_loop(window, [&](const Coordinates & id)
1620 {
1621 uint8_t res = 0;
1622
1623 if(pooling_type != PoolingType::MAX)
1624 {
1625 uint32x4_t vres = vdupq_n_u32(0);
1626 uint32_t sres = 0;
1627
1628 // Calculate scale
1629 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1630
1631 // Perform pooling
1632 for(int y = 0; y < pool_size; ++y)
1633 {
1634 int x = 0;
1635 for(; x <= (pool_size - 8); x += 8)
1636 {
1637 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1638
1639 const uint16x8_t data_u16 = vmovl_u8(data);
1640 vres = vaddq_u32(vres, vaddl_u16(vget_high_u16(data_u16), vget_low_u16(data_u16)));
1641 }
1642
1643 // Leftover for loop
1644 for(; x < pool_size; ++x)
1645 {
1646 uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1647 sres += data;
1648 }
1649 }
1650
1651 // Reduction
1652 const auto tmp = vpadd_u32(vget_high_u32(vres), vget_low_u32(vres));
1653 sres += vget_lane_u32(tmp, 0) + vget_lane_u32(tmp, 1);
1654
1655 // Divide by scale
1656 res = static_cast<uint8_t>(support::cpp11::round(sres * scale));
1657 }
1658 else
1659 {
1660 uint8x8_t vres = vdup_n_u8(0);
1661 res = 0;
1662
1663 for(int y = 0; y < pool_size; ++y)
1664 {
1665 int x = 0;
1666 for(; x <= (pool_size - 8); x += 8)
1667 {
1668 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1669 vres = vmax_u8(vres, data);
1670 }
1671
1672 // Leftover for loop
1673 for(; x < pool_size; ++x)
1674 {
1675 const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1676 res = std::max(res, data);
1677 }
1678 }
1679
1680 // Reduce max
1681 vres = vpmax_u8(vres, vres);
1682 vres = vpmax_u8(vres, vres);
1683 vres = vpmax_u8(vres, vres);
1684
1685 // Get max value
1686 res = std::max(res, vget_lane_u8(vres, 0));
1687 }
1688
1689 // Store result
1690 *(reinterpret_cast<uint8_t *>(output.ptr())) = res;
1691 },
1692 input, output);
1693}
1694
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001695Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
1696{
1697 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
1698
1699 unsigned int pooled_w = 0;
1700 unsigned int pooled_h = 0;
1701 unsigned int num_elems_processed_per_iteration = 0;
1702 BorderSize border_size(0);
1703
1704 const bool is_global_pooling = pool_info.is_global_pooling();
1705 const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
1706
1707 // Validate pool info befor calling scaled_dimensions
1708 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size));
1709
1710 // Check output dimensions
1711 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
1712 input->dimension(1),
1713 pool_size,
1714 pool_size,
1715 pool_info.pad_stride_info());
1716
1717 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size));
1718 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first);
1719
1720 return Status{};
1721}
1722
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001723void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001724{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001725 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001726 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1727 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1728 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1729
Pablo Tello0c34fe22017-06-26 17:17:42 +01001730 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
1731 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Georgios Pinitas55186712018-01-08 17:37:12 +00001732 const unsigned int pool_size = _pool_info.pool_size();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001733
1734 // Set step for input in x and y direction for the input
1735 Window window_input(window);
1736 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +01001737 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001738 {
Pablo Tello0c34fe22017-06-26 17:17:42 +01001739 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001740 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +01001741 case DataType::F16:
1742 {
1743 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1744 break;
1745 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001746 case DataType::QASYMM8:
1747 {
1748 window_x_inc = pool_stride_x;
1749 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
1750 {
1751 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1752 }
1753 break;
1754 }
Pablo Tello0c34fe22017-06-26 17:17:42 +01001755 case DataType::F32:
1756 {
1757 window_x_inc = pool_stride_x;
1758 break;
1759 }
1760 default:
1761 {
1762 ARM_COMPUTE_ERROR("Not supported");
1763 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001764 }
1765 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
1766 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
1767
1768 // Run function
1769 (this->*_func)(window_input, window);
1770}