blob: be5fa4cc4c32dd561ef196518124ac2f46fbaa30 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Georgios Pinitas55186712018-01-08 17:37:12 +00002 * Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
38
Georgios Pinitas55186712018-01-08 17:37:12 +000039#include "support/ToolchainSupport.h"
40
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041#include <algorithm>
42#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010043#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010045#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046#include <string>
47#include <tuple>
48
49using namespace arm_compute;
50
51namespace
52{
Michalis Spyrouafa5d812017-11-30 14:25:57 +000053void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
54{
55 TensorShape output_shape{ input->tensor_shape() };
56 output_shape.set(0, pooled_w);
57 output_shape.set(1, pooled_h);
58
59 auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
60}
61
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000062template <bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010063inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
64 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
65{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000066 int start_x = id.x() * stride_x - pad_x;
67 int start_y = id.y() * stride_y - pad_y;
Pablo Tello0c34fe22017-06-26 17:17:42 +010068 const int end_x = std::min(start_x + pool_size, upper_bound_w);
69 const int end_y = std::min(start_y + pool_size, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000070 if(exclude_padding)
71 {
72 start_x = std::max(0, start_x);
73 start_y = std::max(0, start_y);
74 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075 return 1.f / ((end_y - start_y) * (end_x - start_x));
76}
77
78inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
79 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
80{
Pablo Tello0c34fe22017-06-26 17:17:42 +010081 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010082 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
83 const int start_x = id.x() * stride_x - pad_x;
84 const int start_y = id.y() * stride_y - pad_y;
85 const int end_x = std::min(start_x + pool_size, upper_bound_w);
86 const int end_y = std::min(start_y + pool_size, upper_bound_h);
87 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010088 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
89}
90
91inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
92 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
93{
94 static std::array<qint16_t, 10> scale_values_q16 =
95 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
96 const int start_x = id.x() * stride_x - pad_x;
97 const int start_y = id.y() * stride_y - pad_y;
98 const int end_x = std::min(start_x + pool_size, upper_bound_w);
99 const int end_y = std::min(start_y + pool_size, upper_bound_h);
100 const int val = ((end_y - start_y) * (end_x - start_x));
101 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100102}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103
Georgios Pinitas55186712018-01-08 17:37:12 +0000104template <bool exclude_padding>
105inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
106 const int pool_size, const int upper_bound_w, const int upper_bound_h,
107 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
108{
109 int start_x = (id.x() + id_offset) * stride_x - pad_x;
110 int start_y = id.y() * stride_y - pad_y;
111 const int end_y = std::min(start_y + pool_size, upper_bound_h);
112 if(exclude_padding)
113 {
114 start_y = std::max(0, start_y);
115 }
116
117 std::array<uint16_t, 8> elems =
118 {
119 {
120 vgetq_lane_u16(v, 0),
121 vgetq_lane_u16(v, 1),
122 vgetq_lane_u16(v, 2),
123 vgetq_lane_u16(v, 3),
124 vgetq_lane_u16(v, 4),
125 vgetq_lane_u16(v, 5),
126 vgetq_lane_u16(v, 6),
127 vgetq_lane_u16(v, 7),
128 }
129 };
130
131 for(auto &el : elems)
132 {
133 int c_start_x = start_x;
134 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
135 if(exclude_padding)
136 {
137 c_start_x = std::max(0, c_start_x);
138 }
139 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
140 el *= scale;
141 start_x += step * stride_x;
142 }
143
144 v = vsetq_lane_u16(elems[0], v, 0);
145 v = vsetq_lane_u16(elems[1], v, 1);
146 v = vsetq_lane_u16(elems[2], v, 2);
147 v = vsetq_lane_u16(elems[3], v, 3);
148 v = vsetq_lane_u16(elems[4], v, 4);
149 v = vsetq_lane_u16(elems[5], v, 5);
150 v = vsetq_lane_u16(elems[6], v, 6);
151 v = vsetq_lane_u16(elems[7], v, 7);
152}
153
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000154Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000156 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100157
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000158 int pool_stride_x = 0;
159 int pool_stride_y = 0;
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000160 PoolingType pool_type = pool_info.pool_type();
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000161 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
162 const bool exclude_padding = pool_info.exclude_padding();
163 const bool is_global_pooling = pool_info.is_global_pooling();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100164 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Gian Marco Iodice16824302017-09-28 15:41:37 +0100165 static const std::set<int> supported_pool_sizes = { 2, 3 };
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100166
Georgios Pinitas55186712018-01-08 17:37:12 +0000167 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
168 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
169 ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8)));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000170 ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()));
171 ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
172 ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100173
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000174 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100175 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000176 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
177 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
178 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h));
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100179 }
180
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000181 return Status{};
182}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100183
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000184Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size)
185{
186 const bool is_global_pooling = pool_info.is_global_pooling();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000187 ARM_COMPUTE_UNUSED(pool_size);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000188 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
189 "Global pooling is supported only with rectangular inputs!");
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000190
191 return Status{};
192}
193
194std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
195 BorderSize &border_size,
196 unsigned int pooled_w, unsigned int pooled_h, int pool_size)
197{
198 unsigned int num_elems_read_per_iteration = 0;
199 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000200 int pool_stride_x = 0;
201 int pool_stride_y = 0;
202 const int input_width = input->dimension(0);
203 const int input_height = input->dimension(1);
204 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
205 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000206 const int pool_pad_right = pad_stride_info.pad_right();
207 const int pool_pad_top = pad_stride_info.pad_top();
208 const int pool_pad_left = pad_stride_info.pad_left();
209 const int pool_pad_bottom = pad_stride_info.pad_bottom();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000210
211 // Check output dimensions
212 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
213 input->dimension(1),
214 pool_size,
215 pool_size,
216 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100217
218 // Select element size
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000219 switch(input->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100220 {
221 case DataType::QS8:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100222 num_elems_read_per_iteration = 16;
223 switch(pool_size)
224 {
225 case 2:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100226 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100227 break;
228 case 3:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100229 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100230 break;
231 default:
232 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100233 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100234 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100235 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
236 break;
Georgios Pinitas55186712018-01-08 17:37:12 +0000237 case DataType::QASYMM8:
238 switch(pool_size)
239 {
240 case 2:
241 num_elems_read_per_iteration = 16;
242 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
243 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
244 break;
245 case 3:
246 num_elems_read_per_iteration = 16;
247 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
248 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
249 break;
250 default:
251 num_elems_read_per_iteration = 1;
252 num_elems_processed_per_iteration = 1;
253 num_elems_horizontal_window = 1;
254 break;
255 }
256 break;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100257 case DataType::QS16:
258 num_elems_read_per_iteration = 8;
259 switch(pool_size)
260 {
261 case 2:
262 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
263 break;
264 case 3:
265 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
266 break;
267 default:
268 ARM_COMPUTE_ERROR("Pooling size not supported");
269 }
270 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100271 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000272#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100273 case DataType::F16:
274 switch(pool_size)
275 {
276 case 2:
277 num_elems_read_per_iteration = 16;
278 num_elems_processed_per_iteration = 8;
279 num_elems_horizontal_window = 8;
280 break;
281 case 3:
282 num_elems_read_per_iteration = 4;
283 num_elems_processed_per_iteration = 1;
284 num_elems_horizontal_window = 1;
285 break;
286 default:
287 ARM_COMPUTE_ERROR("Pooling size not supported");
288 break;
289 }
290 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000291#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100292 case DataType::F32:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100293 switch(pool_size)
294 {
295 case 2:
296 num_elems_read_per_iteration = 2;
297 break;
298 case 3:
299 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
300 break;
301 case 7:
302 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
303 break;
304 default:
Gian Marco Iodice16824302017-09-28 15:41:37 +0100305 num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop
Pablo Tello0c34fe22017-06-26 17:17:42 +0100306 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100307 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100308 num_elems_processed_per_iteration = 1;
309 num_elems_horizontal_window = 1;
310 break;
311 default:
312 ARM_COMPUTE_ERROR("Element size not supported");
313 break;
314 }
315
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000316 // Number of iterations in X dimension
317 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
318
319 // Upper limit for the number of right/bottom border elements that are accessed
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000320 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
321 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size) - input_height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000322
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000323 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
324 border_size.right = std::max(upper_bound_w, pool_pad_right);
325 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000326 bool window_changed = false;
327
328 TensorShape output_shape{ input->tensor_shape() };
329 output_shape.set(0, pooled_w);
330 output_shape.set(1, pooled_h);
331 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
332
333 Window win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000334 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000335
336 if(output->total_size() != 0)
337 {
338 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
339 window_changed = update_window_and_padding(win, input_access, output_access);
340 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
341 }
342 else
343 {
344 window_changed = update_window_and_padding(win, input_access);
345 }
346
347 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
348 return std::make_pair(err, win);
349}
350} // namespace
351
352NEPoolingLayerKernel::NEPoolingLayerKernel()
353 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
354{
355}
356
357BorderSize NEPoolingLayerKernel::border_size() const
358{
359 return _border_size;
360}
361
362void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
363{
364 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
365
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000366 const PoolingType pool_type = pool_info.pool_type();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000367 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
368 const bool exclude_padding = pool_info.exclude_padding();
369 const bool is_global_pooling = pool_info.is_global_pooling();
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000370 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000371
372 // Update pool size in case of global pooling
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000373 const int pool_size = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000374
375 // Validate pool info before calling scaled_dimensions
376 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));
377
378 // Check output dimensions
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000379 unsigned int pooled_w, pooled_h;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000380 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
381 input->info()->dimension(1),
382 pool_size,
383 pool_size,
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000384 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000385
386 // Output auto initialization if not yet initialized
387 auto_init(input->info(), output->info(), pooled_w, pooled_h);
388
389 // Perform validation step
390 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100391
392 // Set instance variables
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000393 _input = input;
394 _output = output;
395 _pool_info = pool_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100396
Georgios Pinitas55186712018-01-08 17:37:12 +0000397 // Get data type
398 const DataType data_type = input->info()->data_type();
399
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100400 // Select appropriate function
Georgios Pinitas55186712018-01-08 17:37:12 +0000401 if(data_type == DataType::QS8)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100402 {
Georgios Pinitas55186712018-01-08 17:37:12 +0000403 switch(pool_size)
404 {
405 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100406 switch(pool_type)
407 {
408 case PoolingType::AVG:
409 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
410 break;
411 case PoolingType::MAX:
412 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
413 break;
414 default:
415 ARM_COMPUTE_ERROR("Unsupported pooling type!");
416 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000417 break;
418 case 3:
419 switch(pool_type)
420 {
421 case PoolingType::AVG:
422 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
423 break;
424 case PoolingType::MAX:
425 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
426 break;
427 default:
428 ARM_COMPUTE_ERROR("Unsupported pooling type!");
429 }
430 break;
431 default:
432 ARM_COMPUTE_ERROR("Unsupported pooling size!");
433 }
434 }
435 else if(data_type == DataType::QASYMM8)
436 {
437 if(pool_size == 2 && pool_stride_x < 3)
438 {
439 switch(pool_type)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100440 {
Georgios Pinitas55186712018-01-08 17:37:12 +0000441 case PoolingType::AVG:
442 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, false>;
443 break;
444 case PoolingType::MAX:
445 _func = &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::MAX>;
446 break;
447 default:
448 ARM_COMPUTE_ERROR("Unsupported pooling type!");
449 }
450 }
451 else if(pool_size == 3 && pool_stride_x < 3)
452 {
453 switch(pool_type)
454 {
455 case PoolingType::AVG:
456 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, false>;
457 break;
458 case PoolingType::MAX:
459 _func = &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::MAX>;
460 break;
461 default:
462 ARM_COMPUTE_ERROR("Unsupported pooling type!");
463 }
464 }
465 else
466 {
467 switch(pool_type)
468 {
469 case PoolingType::AVG:
470 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, false>;
471 break;
472 case PoolingType::MAX:
473 _func = &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::MAX>;
474 break;
475 default:
476 ARM_COMPUTE_ERROR("Unsupported pooling type!");
477 }
478 }
479 }
480 else if(data_type == DataType::QS16)
481 {
482 switch(pool_size)
483 {
484 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100485 switch(pool_type)
486 {
487 case PoolingType::AVG:
488 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
489 break;
490 case PoolingType::MAX:
491 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
492 break;
493 default:
494 ARM_COMPUTE_ERROR("Unsupported pooling type!");
495 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000496 break;
497 case 3:
498 switch(pool_type)
499 {
500 case PoolingType::AVG:
501 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
502 break;
503 case PoolingType::MAX:
504 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
505 break;
506 default:
507 ARM_COMPUTE_ERROR("Unsupported pooling type!");
508 }
509 break;
510 default:
511 ARM_COMPUTE_ERROR("Unsupported pooling size!");
512 }
513 }
514 else if(data_type == DataType::F16)
515 {
516 switch(pool_size)
517 {
518 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100519 switch(pool_type)
520 {
521 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000522 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100523 break;
524 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000525 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100526 break;
527 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000528 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100529 break;
530 default:
531 ARM_COMPUTE_ERROR("Unsupported pooling type!");
532 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000533 break;
534 case 3:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100535 switch(pool_type)
536 {
537 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000538 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100539 break;
540 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000541 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100542 break;
543 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000544 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100545 break;
546 default:
547 ARM_COMPUTE_ERROR("Unsupported pooling type!");
548 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000549 break;
550 default:
551 ARM_COMPUTE_ERROR("Unsupported pooling size!");
552 }
553 }
554 else if(data_type == DataType::F32)
555 {
556 switch(pool_size)
557 {
558 case 2:
559 switch(pool_type)
560 {
561 case PoolingType::AVG:
562 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
563 break;
564 case PoolingType::L2:
565 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
566 break;
567 case PoolingType::MAX:
568 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
569 break;
570 default:
571 ARM_COMPUTE_ERROR("Unsupported pooling type!");
572 }
573 break;
574 case 3:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100575 switch(pool_type)
576 {
577 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000578 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100579 break;
580 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000581 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100582 break;
583 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000584 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100585 break;
586 default:
587 ARM_COMPUTE_ERROR("Unsupported pooling type!");
588 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000589 break;
590 case 7:
591 switch(pool_type)
592 {
593 case PoolingType::AVG:
594 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
595 break;
596 case PoolingType::L2:
597 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
598 break;
599 case PoolingType::MAX:
600 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
601 break;
602 default:
603 ARM_COMPUTE_ERROR("Unsupported pooling type!");
604 }
605 break;
606 default:
607 switch(pool_type)
608 {
609 case PoolingType::AVG:
610 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
611 break;
612 case PoolingType::L2:
613 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
614 break;
615 case PoolingType::MAX:
616 _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
617 break;
618 default:
619 ARM_COMPUTE_ERROR("Unsupported pooling type!");
620 }
621 break;
622 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100623 }
624
625 // Configure kernel window
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000626 auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size);
627 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
628 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100629}
630
631template <PoolingType pooling_type>
632void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
633{
634 Iterator input(_input, window_input);
635 Iterator output(_output, window);
636
637 const int fixed_point_position = _input->info()->fixed_point_position();
638 constexpr int pool_size = 2;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100639 int pool_stride_x = 0;
640 int pool_stride_y = 0;
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000641 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
642 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
643 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
644 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100645 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000646 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
647 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100648
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000649 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
650 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100651
652 execute_window_loop(window, [&](const Coordinates & id)
653 {
654 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
655 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100656 qint8x8_t lower_res = {};
657 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100658 if(pooling_type == PoolingType::AVG)
659 {
660 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000661 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100662 const qint8x8_t scale_vec = vdup_n_qs8(scale);
663
664 // Perform pooling
665 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100666 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
667 if(pool_stride_x == 1)
668 {
669 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
670 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
671 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100672 }
673 else
674 {
675 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100676 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
677 if(pool_stride_x == 1)
678 {
679 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
680 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
681 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100682 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100683 if(pool_stride_x == 1)
684 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100685 const qint8x8x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100686 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
687 }
688 else
689 {
690 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
691 }
692 },
693 input, output);
694}
695
Georgios Pinitas55186712018-01-08 17:37:12 +0000696template <PoolingType pooling_type, bool exclude_padding>
697void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window)
698{
699 Iterator input(_input, window_input);
700 Iterator output(_output, window);
701
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000702 constexpr int pool_size = 2;
703 int pool_stride_x = 0;
704 int pool_stride_y = 0;
705 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
706 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
707 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
708 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Georgios Pinitas55186712018-01-08 17:37:12 +0000709 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000710 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
711 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000712
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000713 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
714 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Georgios Pinitas55186712018-01-08 17:37:12 +0000715
716 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
717
718 execute_window_loop(window, [&](const Coordinates & id)
719 {
720 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
721 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
722 uint8x8_t lower_res = {};
723 uint8x8_t upper_res = {};
724
725 if(pooling_type != PoolingType::MAX)
726 {
727 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
728 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
729
730 // Add rows
731 const uint16x8x2_t vrsum =
732 {
733 {
734 vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]),
735 vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]),
736 }
737 };
738
739 // Pair-wise add row data
740 const uint16x4x2_t vpsum =
741 {
742 {
743 vpadd_u16(vget_low_u16(vrsum.val[0]), vget_high_u16(vrsum.val[0])),
744 vpadd_u16(vget_low_u16(vrsum.val[1]), vget_high_u16(vrsum.val[1])),
745 }
746 };
747
748 uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]);
749
750 // Scale lower result
751 scale_vector_s16x8<exclude_padding>(res_lower, id, 0, scale_step_x,
752 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000753 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +0000754 lower_res = vmovn_u16(res_lower);
755
756 // Compute upper result for stride_x == 1
757 if(pool_stride_x == 1)
758 {
759 // Shifted row sum
760 const uint16x8x2_t vrsum_shifted =
761 {
762 {
763 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
764 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
765 }
766 };
767
768 // Pair-wise add shifted row
769 const uint16x4x2_t vpsum_shifted =
770 {
771 {
772 vpadd_u16(vget_low_u16(vrsum_shifted.val[0]), vget_high_u16(vrsum_shifted.val[0])),
773 vpadd_u16(vget_low_u16(vrsum_shifted.val[1]), vget_high_u16(vrsum_shifted.val[1])),
774 }
775 };
776 uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]);
777
778 // Scale lower result
779 scale_vector_s16x8<exclude_padding>(res_upper, id, 1, 2,
780 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000781 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +0000782 upper_res = vmovn_u16(res_upper);
783 }
784 }
785 else
786 {
787 const uint8x16_t max_data = vmaxq_u8(top_data, bottom_data);
788 lower_res = vpmax_u8(vget_low_u8(max_data), vget_high_u8(max_data));
789 if(pool_stride_x == 1)
790 {
791 const uint8x16_t max_data_shifted = vextq_u8(max_data, max_data, 1);
792 upper_res = vpmax_u8(vget_low_u8(max_data_shifted), vget_high_u8(max_data_shifted));
793 }
794 }
795
796 // Store result
797 if(pool_stride_x == 1)
798 {
799 const uint8x8x2_t res = { { lower_res, upper_res } };
800 vst2_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
801 }
802 else
803 {
804 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), lower_res);
805 }
806 },
807 input, output);
808}
809
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100810template <PoolingType pooling_type>
811void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
812{
813 Iterator input(_input, window_input);
814 Iterator output(_output, window);
815
816 const int fixed_point_position = _input->info()->fixed_point_position();
817 constexpr int pool_size = 2;
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000818 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
819 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
820 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
821 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100822 int pool_stride_x = 0;
823 int pool_stride_y = 0;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100824 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000825 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
826 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100827
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000828 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
829 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100830
831 execute_window_loop(window, [&](const Coordinates & id)
832 {
833 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
834 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
835 qint16x4_t lower_res = {};
836 qint16x4_t upper_res = {};
837 if(pooling_type == PoolingType::AVG)
838 {
839 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000840 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100841 const qint16x4_t scale_vec = vdup_n_qs16(scale);
842
843 // Perform pooling
844 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
845 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
846 if(pool_stride_x == 1)
847 {
848 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
849 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
850 }
851 }
852 else
853 {
854 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
855 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
856 if(pool_stride_x == 1)
857 {
858 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
859 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
860 }
861 }
862 if(pool_stride_x == 1)
863 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100864 const qint16x4x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100865 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
866 }
867 else
868 {
869 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
870 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100871 },
872 input, output);
873}
874
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000875template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100876void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
877{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000878#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100879 Iterator input(_input, window_input);
880 Iterator output(_output, window);
881
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000882 constexpr const int pool_size = 3;
883 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
884 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
885 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
886 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
887 int pool_stride_x = 0;
888 int pool_stride_y = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100889 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000890 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
891 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100892
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000893 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
894 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
895 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100896
897 execute_window_loop(window, [&](const Coordinates & id)
898 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100899 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
900 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
901 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
902 float16x4_t res = {};
903
904 // Get power of 2 in case of l2 pooling
905 if(pooling_type == PoolingType::L2)
906 {
907 top_data = vmul_f16(top_data, top_data);
908 middle_data = vmul_f16(middle_data, middle_data);
909 bottom_data = vmul_f16(bottom_data, bottom_data);
910 }
911
912 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100913 {
914 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000915 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100916 const float16x4_t scale_v = vdup_n_f16(scale);
917 // Perform pooling
918 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
919 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
920 res = vmul_f16(vpadd_f16(res, res), scale_v);
921 }
922 else
923 {
924 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
925 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
926 res = vpmax_f16(res, res);
927 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100928
929 // Calculate square-root in case of l2 pooling
930 if(pooling_type == PoolingType::L2)
931 {
932 res = vinv_f16(vinvsqrt_f16(res));
933 }
934
Pablo Tello0c34fe22017-06-26 17:17:42 +0100935 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
936 },
937 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000938#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100939 ARM_COMPUTE_UNUSED(window_input);
940 ARM_COMPUTE_UNUSED(window);
941 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000942#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100943}
944
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000945template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100946void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
947{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000948#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100949 Iterator input(_input, window_input);
950 Iterator output(_output, window);
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000951 constexpr int pool_size = 2;
952 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
953 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
954 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
955 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
956 int pool_stride_x, pool_stride_y = 0;
957 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
958 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
959 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100960
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000961 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
962 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100963
964 execute_window_loop(window, [&](const Coordinates & id)
965 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100966 auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
967 auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100968 float16x8_t res = {};
969
Georgios Pinitascdf51452017-08-31 14:21:36 +0100970 // Get power of 2 in case of l2 pooling
971 if(pooling_type == PoolingType::L2)
972 {
973 top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
974 top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
975 bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
976 bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
977 }
978
979 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100980 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000981 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100982 const float16x8_t scale_v = vdupq_n_f16(scale);
983 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
984 }
985 else
986 {
987 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
988 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100989
990 // Calculate square-root in case of l2 pooling
991 if(pooling_type == PoolingType::L2)
992 {
993 res = vinvq_f16(vinvsqrtq_f16(res));
994 }
995
996 // Store result
Pablo Tello0c34fe22017-06-26 17:17:42 +0100997 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
998 },
999 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001000#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001001 ARM_COMPUTE_UNUSED(window_input);
1002 ARM_COMPUTE_UNUSED(window);
1003 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001004#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001005}
1006
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001007template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001008void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
1009{
1010 Iterator input(_input, window_input);
1011 Iterator output(_output, window);
1012
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001013 constexpr int pool_size = 2;
1014 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1015 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1016 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1017 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1018 int pool_stride_x = 0;
1019 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001020 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001021 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1022 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001023
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001024 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1025 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001026
1027 execute_window_loop(window, [&](const Coordinates & id)
1028 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001029 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1030 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1031 float32x2_t res = {};
1032 float final_res = 0;
1033
1034 // Get power of 2 in case of l2 pooling
1035 if(pooling_type == PoolingType::L2)
1036 {
1037 top_data = vmul_f32(top_data, top_data);
1038 bottom_data = vmul_f32(bottom_data, bottom_data);
1039 }
1040
1041 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001042 {
1043 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001044 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001045 const float32x2_t scale_v = vdup_n_f32(scale);
1046
1047 // Perform pooling
1048 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1049 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1050 }
1051 else
1052 {
1053 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1054 res = vpmax_f32(max_data, max_data);
1055 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001056 final_res = vget_lane_f32(res, 0);
1057
1058 // Calculate square-root in case of l2 pooling
1059 if(pooling_type == PoolingType::L2)
1060 {
1061 final_res = sqrt(final_res);
1062 }
1063
1064 // Store result
1065 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001066 },
1067 input, output);
1068}
1069
1070template <PoolingType pooling_type>
1071void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
1072{
1073 Iterator input(_input, window_input);
1074 Iterator output(_output, window);
1075
1076 const int fixed_point_position = _input->info()->fixed_point_position();
1077 constexpr int pool_size = 3;
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001078 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1079 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1080 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1081 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001082 int pool_stride_x = 0;
1083 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001084 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001085 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
1086 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001087
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001088 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1089 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1090 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001091
1092 execute_window_loop(window, [&](const Coordinates & id)
1093 {
1094 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
1095 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
1096 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
1097 qint8x8_t res = {};
1098 if(pooling_type == PoolingType::AVG)
1099 {
1100 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001101 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001102
1103 // Perform pooling for stride 2
1104 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
1105 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
1106 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
1107 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
1108 if(pool_stride_x == 2)
1109 {
1110 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
1111 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001112 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001113 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001114 res = vqmul_qs8(res, scale_vec, fixed_point_position);
1115 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001116 }
1117 else
1118 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001119 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
1120 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001121 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001122 }
1123 else
1124 {
1125 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
1126 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
1127 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
1128 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
1129
1130 if(pool_stride_x == 2)
1131 {
1132 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
1133 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1134 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001135 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001136 }
1137 else
1138 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001139 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001140 }
1141 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001142 },
1143 input, output);
1144}
1145
Georgios Pinitas55186712018-01-08 17:37:12 +00001146template <PoolingType pooling_type, bool exclude_padding>
1147void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window)
1148{
1149 Iterator input(_input, window_input);
1150 Iterator output(_output, window);
1151
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001152 constexpr int pool_size = 3;
1153 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1154 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1155 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1156 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1157 int pool_stride_x = 0;
1158 int pool_stride_y = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001159 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001160 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1161 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001162
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001163 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1164 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1165 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Georgios Pinitas55186712018-01-08 17:37:12 +00001166
1167 execute_window_loop(window, [&](const Coordinates & id)
1168 {
1169 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
1170 const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
1171 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
1172
1173 if(pooling_type == PoolingType::AVG)
1174 {
1175 // Convert data to u16
1176 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
1177 const uint16x8x2_t middle_data_u16 = { { vmovl_u8(vget_low_u8(middle_data)), vmovl_u8(vget_high_u8(middle_data)) } };
1178 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
1179
1180 // Calculate row sums
1181 const uint16x8x2_t vrsum =
1182 {
1183 {
1184 vaddq_u16(vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]), middle_data_u16.val[0]),
1185 vaddq_u16(vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]), middle_data_u16.val[1]),
1186 }
1187 };
1188 const uint16x8x2_t vrsum_shifted_1 =
1189 {
1190 {
1191 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
1192 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
1193 }
1194 };
1195 const uint16x8x2_t vrsum_shifted_2 =
1196 {
1197 {
1198 vextq_u16(vrsum.val[0], vrsum.val[1], 2),
1199 vextq_u16(vrsum.val[1], vrsum.val[1], 2)
1200 }
1201 };
1202 // Calculate final sum
1203 uint16x8x2_t final_sum =
1204 {
1205 {
1206 vaddq_u16(vaddq_u16(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1207 vaddq_u16(vaddq_u16(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
1208 }
1209 };
1210 if(pool_stride_x == 2)
1211 {
1212 uint16x8_t res =
1213 {
1214 vgetq_lane_u16(final_sum.val[0], 0),
1215 vgetq_lane_u16(final_sum.val[0], 2),
1216 vgetq_lane_u16(final_sum.val[0], 4),
1217 vgetq_lane_u16(final_sum.val[0], 6),
1218 vgetq_lane_u16(final_sum.val[1], 0),
1219 vgetq_lane_u16(final_sum.val[1], 2),
1220 vgetq_lane_u16(final_sum.val[1], 4),
1221 vgetq_lane_u16(final_sum.val[1], 6),
1222 };
1223
1224 scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
1225 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001226 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001227 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
1228 }
1229 else
1230 {
1231 // Scale lower result
1232 scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
1233 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001234 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001235 // Scale lower result
1236 scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
1237 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001238 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001239 const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
1240 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
1241 }
1242 }
1243 else
1244 {
1245 const uint8x16_t max_data = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data);
1246 const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1);
1247 const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2);
1248 const uint8x16_t final_max = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2);
1249
1250 if(pool_stride_x == 2)
1251 {
1252 const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
1253 static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1254 const uint8x8_t res = vtbl2_u8(table, lookup_val);
1255 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
1256 }
1257 else
1258 {
1259 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
1260 }
1261 }
1262 },
1263 input, output);
1264}
1265
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001266template <PoolingType pooling_type>
1267void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
1268{
1269 Iterator input(_input, window_input);
1270 Iterator output(_output, window);
1271
1272 const int fixed_point_position = _input->info()->fixed_point_position();
1273 constexpr int pool_size = 3;
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001274 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1275 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1276 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1277 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001278 int pool_stride_x = 0;
1279 int pool_stride_y = 0;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001280 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001281 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
1282 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001283
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001284 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1285 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1286 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001287
1288 execute_window_loop(window, [&](const Coordinates & id)
1289 {
1290 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
1291 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
1292 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
1293
1294 if(pooling_type == PoolingType::AVG)
1295 {
1296 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001297 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001298
1299 // Perform pooling for stride 2
1300 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
1301 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
1302 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
1303 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
1304 if(pool_stride_x == 2)
1305 {
1306 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
1307 const qint16x4_t scale_vec = vdup_n_qs16(scale);
1308 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
1309 }
1310 else
1311 {
1312 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
1313 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
1314 }
1315 }
1316 else
1317 {
1318 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
1319 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
1320 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
1321 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
1322
1323 if(pool_stride_x == 2)
1324 {
1325 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
1326 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
1327 }
1328 else
1329 {
1330 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
1331 }
1332 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001333 },
1334 input, output);
1335}
1336
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001337template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001338void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
1339{
1340 Iterator input(_input, window_input);
1341 Iterator output(_output, window);
1342
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001343 constexpr const int pool_size = 3;
1344 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1345 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1346 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1347 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1348 int pool_stride_x = 0;
1349 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001350 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001351 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1352 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001353
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001354 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1355 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1356 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001357
1358 execute_window_loop(window, [&](const Coordinates & id)
1359 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001360 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1361 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1362 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1363 float32x2_t res = {};
1364 float final_res = 0;
1365
1366 // Get power of 2 in case of l2 pooling
1367 if(pooling_type == PoolingType::L2)
1368 {
1369 top_data = vmulq_f32(top_data, top_data);
1370 middle_data = vmulq_f32(middle_data, middle_data);
1371 bottom_data = vmulq_f32(bottom_data, bottom_data);
1372 }
1373
1374 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001375 {
1376 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001377 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001378 const float32x2_t scale_v = vdup_n_f32(scale);
1379
1380 // Perform pooling
1381 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1382 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1383 res = vmul_f32(vpadd_f32(res, res), scale_v);
1384 }
1385 else
1386 {
1387 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1388 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1389 res = vpmax_f32(res, res);
1390 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001391 final_res = vget_lane_f32(res, 0);
1392
1393 // Calculate square-root in case of l2 pooling
1394 if(pooling_type == PoolingType::L2)
1395 {
1396 final_res = sqrt(final_res);
1397 }
1398
1399 // Store result
1400 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001401 },
1402 input, output);
1403}
1404
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001405template <PoolingType pooling_type, bool exclude_padding>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001406void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
1407{
1408 Iterator input(_input, window_input);
1409 Iterator output(_output, window);
1410
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001411 constexpr const int pool_size = 7;
1412 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1413 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1414 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1415 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1416 int pool_stride_x = 0;
1417 int pool_stride_y = 0;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001418 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001419 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1420 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001421
1422 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1423 for(int i = 0; i < pool_size; ++i)
1424 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001425 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001426 }
1427
1428 execute_window_loop(window, [&](const Coordinates & id)
1429 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001430 float32x2_t res = {};
1431 float final_res = 0.f;
1432 if(pooling_type != PoolingType::MAX)
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001433 {
1434 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001435 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001436 const float32x2_t scale_v = vdup_n_f32(scale);
1437
1438 // Perform pooling
Georgios Pinitascdf51452017-08-31 14:21:36 +01001439 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1440 // Get power of 2 in case of l2 pooling
1441 if(pooling_type == PoolingType::L2)
1442 {
1443 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1444 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1445 }
1446 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001447 for(int i = 1; i < pool_size; ++i)
1448 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001449 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1450 // Get power of 2 in case of l2 pooling
1451 if(pooling_type == PoolingType::L2)
1452 {
1453 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1454 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1455 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001456 sum_data = vaddq_f32(sum_data, data.val[0]);
1457 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1458 }
1459 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1460 res = vmul_f32(vpadd_f32(res, res), scale_v);
1461 }
1462 else
1463 {
1464 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1465 for(int i = 1; i < pool_size; ++i)
1466 {
1467 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1468 max_data = vmax2q_f32(max_data, data);
1469 }
1470 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1471 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1472 res = vpmax_f32(res, res);
1473 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001474 final_res = vget_lane_f32(res, 0);
1475
1476 // Calculate square-root in case of l2 pooling
1477 if(pooling_type == PoolingType::L2)
1478 {
1479 final_res = sqrt(final_res);
1480 }
1481
1482 // Store result
1483 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001484 },
1485 input, output);
1486}
1487
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001488template <PoolingType pooling_type, bool exclude_padding>
Gian Marco Iodice16824302017-09-28 15:41:37 +01001489void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
1490{
1491 Iterator input(_input, window_input);
1492 Iterator output(_output, window);
1493
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001494 const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
1495 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1496 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1497 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1498 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1499 int pool_stride_x = 0;
1500 int pool_stride_y = 0;
Gian Marco Iodice16824302017-09-28 15:41:37 +01001501 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001502 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1503 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001504
1505 execute_window_loop(window, [&](const Coordinates & id)
1506 {
1507 float res = 0.0f;
1508
1509 if(pooling_type != PoolingType::MAX)
1510 {
1511 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001512 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001513
1514 // Perform pooling
1515 float32x4_t vres = vdupq_n_f32(0.0f);
1516
1517 for(int y = 0; y < pool_size; ++y)
1518 {
1519 int x = 0;
1520 for(; x <= (pool_size - 4); x += 4)
1521 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001522 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1523 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001524
1525 // Get power of 2 in case of l2 pooling and accumulate
1526 if(pooling_type == PoolingType::L2)
1527 {
1528 vres = vmlaq_f32(vres, data, data);
1529 }
1530 else
1531 {
1532 vres = vaddq_f32(vres, data);
1533 }
1534 }
1535
1536 // Leftover for loop
1537 for(; x < pool_size; ++x)
1538 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001539 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001540
1541 // Get power of 2 in case of l2 pooling
1542 if(pooling_type == PoolingType::L2)
1543 {
1544 data *= data;
1545 }
1546
1547 res += data;
1548 }
1549 }
1550
1551#if defined(__aarch64__)
1552 // Reduction operation available on 64 bit architectures only
1553 res += vaddvq_f32(vres);
1554#else // __aarch64__
1555 // Reduction
1556 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1557 tmp = vpadd_f32(tmp, tmp);
1558
1559 res += vget_lane_f32(tmp, 0);
1560#endif // __aarch64__
1561 // Divide by scale
1562 res *= scale;
1563 }
1564 else
1565 {
1566 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::min());
1567 res = std::numeric_limits<float>::min();
1568
1569 for(int y = 0; y < pool_size; ++y)
1570 {
1571 int x = 0;
1572 for(; x <= (pool_size - 4); x += 4)
1573 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001574 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1575 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001576 vres = vmaxq_f32(vres, data);
1577 }
1578
1579 // Leftover for loop
1580 for(; x < pool_size; ++x)
1581 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001582 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001583 res = std::max(res, data);
1584 }
1585 }
1586
1587#if defined(__aarch64__)
1588 // Reduction operation available on 64 bit architectures only
1589 res = std::max(vmaxvq_f32(vres), res);
1590#else // __aarch64__
1591 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1592 tmp = vpmax_f32(tmp, tmp);
1593
1594 res = std::max(res, vget_lane_f32(tmp, 0));
1595#endif // __aarch64__
1596 }
1597
1598 // Calculate square-root in case of l2 pooling
1599 if(pooling_type == PoolingType::L2)
1600 {
1601 res = std::sqrt(res);
1602 }
1603
1604 // Store result
1605 *(reinterpret_cast<float *>(output.ptr())) = res;
1606 },
1607 input, output);
1608}
1609
Georgios Pinitas55186712018-01-08 17:37:12 +00001610template <PoolingType pooling_type, bool exclude_padding>
1611void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Window &window)
1612{
1613 Iterator input(_input, window_input);
1614 Iterator output(_output, window);
1615
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001616 const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
1617 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1618 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1619 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1620 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1621 int pool_stride_x = 0;
1622 int pool_stride_y = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001623 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001624 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1625 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001626
1627 execute_window_loop(window, [&](const Coordinates & id)
1628 {
1629 uint8_t res = 0;
1630
1631 if(pooling_type != PoolingType::MAX)
1632 {
1633 uint32x4_t vres = vdupq_n_u32(0);
1634 uint32_t sres = 0;
1635
1636 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001637 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001638
1639 // Perform pooling
1640 for(int y = 0; y < pool_size; ++y)
1641 {
1642 int x = 0;
1643 for(; x <= (pool_size - 8); x += 8)
1644 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001645 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1646 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001647
1648 const uint16x8_t data_u16 = vmovl_u8(data);
1649 vres = vaddq_u32(vres, vaddl_u16(vget_high_u16(data_u16), vget_low_u16(data_u16)));
1650 }
1651
1652 // Leftover for loop
1653 for(; x < pool_size; ++x)
1654 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001655 uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001656 sres += data;
1657 }
1658 }
1659
1660 // Reduction
1661 const auto tmp = vpadd_u32(vget_high_u32(vres), vget_low_u32(vres));
1662 sres += vget_lane_u32(tmp, 0) + vget_lane_u32(tmp, 1);
1663
1664 // Divide by scale
1665 res = static_cast<uint8_t>(support::cpp11::round(sres * scale));
1666 }
1667 else
1668 {
1669 uint8x8_t vres = vdup_n_u8(0);
1670 res = 0;
1671
1672 for(int y = 0; y < pool_size; ++y)
1673 {
1674 int x = 0;
1675 for(; x <= (pool_size - 8); x += 8)
1676 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001677 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1678 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001679 vres = vmax_u8(vres, data);
1680 }
1681
1682 // Leftover for loop
1683 for(; x < pool_size; ++x)
1684 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001685 const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001686 res = std::max(res, data);
1687 }
1688 }
1689
1690 // Reduce max
1691 vres = vpmax_u8(vres, vres);
1692 vres = vpmax_u8(vres, vres);
1693 vres = vpmax_u8(vres, vres);
1694
1695 // Get max value
1696 res = std::max(res, vget_lane_u8(vres, 0));
1697 }
1698
1699 // Store result
1700 *(reinterpret_cast<uint8_t *>(output.ptr())) = res;
1701 },
1702 input, output);
1703}
1704
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001705Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
1706{
1707 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
1708
1709 unsigned int pooled_w = 0;
1710 unsigned int pooled_h = 0;
1711 unsigned int num_elems_processed_per_iteration = 0;
1712 BorderSize border_size(0);
1713
1714 const bool is_global_pooling = pool_info.is_global_pooling();
1715 const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
1716
1717 // Validate pool info befor calling scaled_dimensions
1718 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size));
1719
1720 // Check output dimensions
1721 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
1722 input->dimension(1),
1723 pool_size,
1724 pool_size,
1725 pool_info.pad_stride_info());
1726
1727 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size));
1728 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first);
1729
1730 return Status{};
1731}
1732
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001733void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001734{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001735 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001736 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1737 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1738 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1739
Pablo Tello0c34fe22017-06-26 17:17:42 +01001740 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
1741 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Georgios Pinitas55186712018-01-08 17:37:12 +00001742 const unsigned int pool_size = _pool_info.pool_size();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001743
1744 // Set step for input in x and y direction for the input
1745 Window window_input(window);
1746 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +01001747 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001748 {
Pablo Tello0c34fe22017-06-26 17:17:42 +01001749 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001750 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +01001751 case DataType::F16:
1752 {
1753 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1754 break;
1755 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001756 case DataType::QASYMM8:
1757 {
1758 window_x_inc = pool_stride_x;
1759 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
1760 {
1761 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1762 }
1763 break;
1764 }
Pablo Tello0c34fe22017-06-26 17:17:42 +01001765 case DataType::F32:
1766 {
1767 window_x_inc = pool_stride_x;
1768 break;
1769 }
1770 default:
1771 {
1772 ARM_COMPUTE_ERROR("Not supported");
1773 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001774 }
1775 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
1776 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
1777
1778 // Run function
1779 (this->*_func)(window_input, window);
1780}