blob: b6af51733af70425b6b5ecef02bf9ca05e6b8ee4 [file] [log] [blame]
/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
38
Georgios Pinitas55186712018-01-08 17:37:12 +000039#include "support/ToolchainSupport.h"
40
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041#include <algorithm>
42#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010043#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010045#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046#include <string>
47#include <tuple>
48
49using namespace arm_compute;
50
51namespace
52{
Michalis Spyrouafa5d812017-11-30 14:25:57 +000053void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
54{
55 TensorShape output_shape{ input->tensor_shape() };
56 output_shape.set(0, pooled_w);
57 output_shape.set(1, pooled_h);
58
59 auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
60}
61
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000062template <bool exclude_padding>
Isabella Gottardi7567f5f2018-01-30 15:26:00 +000063inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010064 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
65{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000066 int start_x = id.x() * stride_x - pad_x;
67 int start_y = id.y() * stride_y - pad_y;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +000068 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
69 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000070 if(exclude_padding)
71 {
72 start_x = std::max(0, start_x);
73 start_y = std::max(0, start_y);
74 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075 return 1.f / ((end_y - start_y) * (end_x - start_x));
76}
77
78inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
79 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
80{
Pablo Tello0c34fe22017-06-26 17:17:42 +010081 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010082 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
83 const int start_x = id.x() * stride_x - pad_x;
84 const int start_y = id.y() * stride_y - pad_y;
85 const int end_x = std::min(start_x + pool_size, upper_bound_w);
86 const int end_y = std::min(start_y + pool_size, upper_bound_h);
87 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010088 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
89}
90
91inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
92 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
93{
94 static std::array<qint16_t, 10> scale_values_q16 =
95 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
96 const int start_x = id.x() * stride_x - pad_x;
97 const int start_y = id.y() * stride_y - pad_y;
98 const int end_x = std::min(start_x + pool_size, upper_bound_w);
99 const int end_y = std::min(start_y + pool_size, upper_bound_h);
100 const int val = ((end_y - start_y) * (end_x - start_x));
101 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100102}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103
Georgios Pinitas55186712018-01-08 17:37:12 +0000104template <bool exclude_padding>
105inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
106 const int pool_size, const int upper_bound_w, const int upper_bound_h,
107 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
108{
109 int start_x = (id.x() + id_offset) * stride_x - pad_x;
110 int start_y = id.y() * stride_y - pad_y;
111 const int end_y = std::min(start_y + pool_size, upper_bound_h);
112 if(exclude_padding)
113 {
114 start_y = std::max(0, start_y);
115 }
116
117 std::array<uint16_t, 8> elems =
118 {
119 {
120 vgetq_lane_u16(v, 0),
121 vgetq_lane_u16(v, 1),
122 vgetq_lane_u16(v, 2),
123 vgetq_lane_u16(v, 3),
124 vgetq_lane_u16(v, 4),
125 vgetq_lane_u16(v, 5),
126 vgetq_lane_u16(v, 6),
127 vgetq_lane_u16(v, 7),
128 }
129 };
130
131 for(auto &el : elems)
132 {
133 int c_start_x = start_x;
134 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
135 if(exclude_padding)
136 {
137 c_start_x = std::max(0, c_start_x);
138 }
139 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
140 el *= scale;
141 start_x += step * stride_x;
142 }
143
144 v = vsetq_lane_u16(elems[0], v, 0);
145 v = vsetq_lane_u16(elems[1], v, 1);
146 v = vsetq_lane_u16(elems[2], v, 2);
147 v = vsetq_lane_u16(elems[3], v, 3);
148 v = vsetq_lane_u16(elems[4], v, 4);
149 v = vsetq_lane_u16(elems[5], v, 5);
150 v = vsetq_lane_u16(elems[6], v, 6);
151 v = vsetq_lane_u16(elems[7], v, 7);
152}
153
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000154Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000156 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100157
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000158 int pool_stride_x = 0;
159 int pool_stride_y = 0;
160 PoolingType pool_type = pool_info.pool_type();
161 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
162 const bool exclude_padding = pool_info.exclude_padding();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100163 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Gian Marco Iodice16824302017-09-28 15:41:37 +0100164 static const std::set<int> supported_pool_sizes = { 2, 3 };
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100165
Georgios Pinitas55186712018-01-08 17:37:12 +0000166 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
167 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000168
169 ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8))
170 && (pool_type != PoolingType::MAX));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000171 ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
172 ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100173
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000174 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100175 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000176 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
177 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
178 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h));
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100179 }
180
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000181 return Status{};
182}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100183
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000184Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000185{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000186 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
187 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000188
189 return Status{};
190}
191
192std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
193 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000194 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000195{
196 unsigned int num_elems_read_per_iteration = 0;
197 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000198 int pool_stride_x = 0;
199 int pool_stride_y = 0;
200 const int input_width = input->dimension(0);
201 const int input_height = input->dimension(1);
202 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
203 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000204 const int pool_pad_right = pad_stride_info.pad_right();
205 const int pool_pad_top = pad_stride_info.pad_top();
206 const int pool_pad_left = pad_stride_info.pad_left();
207 const int pool_pad_bottom = pad_stride_info.pad_bottom();
208 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000209 // Check output dimensions
210 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
211 input->dimension(1),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000212 pool_size_x,
213 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000214 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100215
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000216 //If it's not squared and optimized will be executed the MxN
217 num_elems_read_per_iteration = 1;
218 num_elems_processed_per_iteration = 1;
219 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100220
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000221 if(is_square)
222 {
223 switch(input->data_type())
224 {
225 case DataType::QS8:
226 num_elems_read_per_iteration = 16;
227 switch(pool_size_x)
228 {
229 case 2:
230 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
231 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
232 break;
233 case 3:
234 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
235 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
236 break;
237 default:
238 break;
239 }
240 break;
241 case DataType::QASYMM8:
242 switch(pool_size_x)
243 {
244 case 2:
245 num_elems_read_per_iteration = 16;
246 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
247 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
248 break;
249 case 3:
250 num_elems_read_per_iteration = 16;
251 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
252 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
253 break;
254 default:
255 break;
256 }
257 break;
258 case DataType::QS16:
259 num_elems_read_per_iteration = 8;
260 switch(pool_size_x)
261 {
262 case 2:
263 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
264 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
265 break;
266 case 3:
267 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
268 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
269 break;
270 default:
271 break;
272 }
273 break;
274#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
275 case DataType::F16:
276 switch(pool_size_x)
277 {
278 case 2:
279 num_elems_read_per_iteration = 16;
280 num_elems_processed_per_iteration = 8;
281 num_elems_horizontal_window = 8;
282 break;
283 case 3:
284 num_elems_read_per_iteration = 4;
285 num_elems_processed_per_iteration = 1;
286 num_elems_horizontal_window = 1;
287 break;
288 default:
289 break;
290 }
291 break;
292#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
293 case DataType::F32:
294 switch(pool_size_x)
295 {
296 case 2:
297 num_elems_read_per_iteration = 2;
298 break;
299 case 3:
300 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
301 break;
302 case 7:
303 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
304 break;
305 default:
306 break;
307 }
308 num_elems_processed_per_iteration = 1;
309 num_elems_horizontal_window = 1;
310 break;
311 default:
312 ARM_COMPUTE_ERROR("Element size not supported");
313 break;
314 }
315 }
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000316 // Number of iterations in X dimension
317 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
318
319 // Upper limit for the number of right/bottom border elements that are accessed
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000320 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000321 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000322
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000323 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
324 border_size.right = std::max(upper_bound_w, pool_pad_right);
325 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000326 bool window_changed = false;
327
328 TensorShape output_shape{ input->tensor_shape() };
329 output_shape.set(0, pooled_w);
330 output_shape.set(1, pooled_h);
331 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
332
333 Window win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000334 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000335
336 if(output->total_size() != 0)
337 {
338 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
339 window_changed = update_window_and_padding(win, input_access, output_access);
340 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
341 }
342 else
343 {
344 window_changed = update_window_and_padding(win, input_access);
345 }
346
347 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
348 return std::make_pair(err, win);
349}
350} // namespace
351
352NEPoolingLayerKernel::NEPoolingLayerKernel()
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000353 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000354{
355}
356
357BorderSize NEPoolingLayerKernel::border_size() const
358{
359 return _border_size;
360}
361
362void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
363{
364 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
365
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000366 const PoolingType pool_type = pool_info.pool_type();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000367 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
368 const bool exclude_padding = pool_info.exclude_padding();
369 const bool is_global_pooling = pool_info.is_global_pooling();
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000370 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000371
372 // Update pool size in case of global pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000373 const int pool_size_x = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width;
374 const int pool_size_y = is_global_pooling ? input->info()->dimension(1) : pool_info.pool_size().height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000375
376 // Validate pool info before calling scaled_dimensions
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000377 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000378
379 // Check output dimensions
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000380 unsigned int pooled_w, pooled_h;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000381 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
382 input->info()->dimension(1),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000383 pool_size_x,
384 pool_size_y,
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000385 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000386
387 // Output auto initialization if not yet initialized
388 auto_init(input->info(), output->info(), pooled_w, pooled_h);
389
390 // Perform validation step
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000391 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size_x));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100392
393 // Set instance variables
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000394 _input = input;
395 _output = output;
396 _pool_info = pool_info;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000397 _is_square = (pool_size_x == pool_size_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100398
Georgios Pinitas55186712018-01-08 17:37:12 +0000399 // Get data type
400 const DataType data_type = input->info()->data_type();
401
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100402 // Select appropriate function
Georgios Pinitas55186712018-01-08 17:37:12 +0000403 if(data_type == DataType::QS8)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100404 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000405 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000406 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000407 switch(pool_size_x)
408 {
409 case 2:
410 switch(pool_type)
411 {
412 case PoolingType::AVG:
413 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
414 break;
415 case PoolingType::MAX:
416 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
417 break;
418 default:
419 ARM_COMPUTE_ERROR("Unsupported pooling type!");
420 }
421 break;
422 case 3:
423 switch(pool_type)
424 {
425 case PoolingType::AVG:
426 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
427 break;
428 case PoolingType::MAX:
429 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
430 break;
431 default:
432 ARM_COMPUTE_ERROR("Unsupported pooling type!");
433 }
434 break;
435 default:
436 switch(pool_type)
437 {
438 case PoolingType::MAX:
439 _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
440 break;
441 default:
442 ARM_COMPUTE_ERROR("Unsupported pooling type!");
443 }
444 break;
445 }
446 }
447 else
448 {
449 switch(pool_type)
450 {
451 case PoolingType::MAX:
452 _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
453 break;
454 default:
455 ARM_COMPUTE_ERROR("Unsupported pooling type!");
456 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000457 }
458 }
459 else if(data_type == DataType::QASYMM8)
460 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000461 if(pool_size_x == 2 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000462 {
463 switch(pool_type)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100464 {
Georgios Pinitas55186712018-01-08 17:37:12 +0000465 case PoolingType::AVG:
466 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, false>;
467 break;
468 case PoolingType::MAX:
469 _func = &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::MAX>;
470 break;
471 default:
472 ARM_COMPUTE_ERROR("Unsupported pooling type!");
473 }
474 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000475 else if(pool_size_x == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000476 {
477 switch(pool_type)
478 {
479 case PoolingType::AVG:
480 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, false>;
481 break;
482 case PoolingType::MAX:
483 _func = &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::MAX>;
484 break;
485 default:
486 ARM_COMPUTE_ERROR("Unsupported pooling type!");
487 }
488 }
489 else
490 {
491 switch(pool_type)
492 {
493 case PoolingType::AVG:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000494 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, false>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000495 break;
496 case PoolingType::MAX:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000497 _func = &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::MAX>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000498 break;
499 default:
500 ARM_COMPUTE_ERROR("Unsupported pooling type!");
501 }
502 }
503 }
504 else if(data_type == DataType::QS16)
505 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000506 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000507 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000508 switch(pool_size_x)
509 {
510 case 2:
511 switch(pool_type)
512 {
513 case PoolingType::AVG:
514 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
515 break;
516 case PoolingType::MAX:
517 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
518 break;
519 default:
520 ARM_COMPUTE_ERROR("Unsupported pooling type!");
521 }
522 break;
523 case 3:
524 switch(pool_type)
525 {
526 case PoolingType::AVG:
527 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
528 break;
529 case PoolingType::MAX:
530 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
531 break;
532 default:
533 ARM_COMPUTE_ERROR("Unsupported pooling type!");
534 }
535 break;
536 default:
537 switch(pool_type)
538 {
539 case PoolingType::MAX:
540 _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
541 break;
542 default:
543 ARM_COMPUTE_ERROR("Unsupported pooling type!");
544 }
545 break;
546 }
547 }
548 else
549 {
550 switch(pool_type)
551 {
552 case PoolingType::MAX:
553 _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
554 break;
555 default:
556 ARM_COMPUTE_ERROR("Unsupported pooling type!");
557 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000558 }
559 }
560 else if(data_type == DataType::F16)
561 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000562 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000563 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000564 switch(pool_size_x)
565 {
566 case 2:
567 switch(pool_type)
568 {
569 case PoolingType::AVG:
570 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
571 break;
572 case PoolingType::L2:
573 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
574 break;
575 case PoolingType::MAX:
576 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
577 break;
578 default:
579 ARM_COMPUTE_ERROR("Unsupported pooling type!");
580 }
581 break;
582 case 3:
583 switch(pool_type)
584 {
585 case PoolingType::AVG:
586 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
587 break;
588 case PoolingType::L2:
589 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
590 break;
591 case PoolingType::MAX:
592 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
593 break;
594 default:
595 ARM_COMPUTE_ERROR("Unsupported pooling type!");
596 }
597 break;
598 default:
599 switch(pool_type)
600 {
601 case PoolingType::AVG:
602 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
603 break;
604 case PoolingType::L2:
605 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
606 break;
607 case PoolingType::MAX:
608 _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
609 break;
610 default:
611 ARM_COMPUTE_ERROR("Unsupported pooling type!");
612 }
613 break;
614 }
615 }
616 else
617 {
618 switch(pool_type)
619 {
620 case PoolingType::AVG:
621 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
622 break;
623 case PoolingType::L2:
624 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
625 break;
626 case PoolingType::MAX:
627 _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
628 break;
629 default:
630 ARM_COMPUTE_ERROR("Unsupported pooling type!");
631 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000632 }
633 }
634 else if(data_type == DataType::F32)
635 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000636 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000637 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000638 switch(pool_size_x)
639 {
640 case 2:
641 switch(pool_type)
642 {
643 case PoolingType::AVG:
644 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
645 break;
646 case PoolingType::L2:
647 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
648 break;
649 case PoolingType::MAX:
650 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
651 break;
652 default:
653 ARM_COMPUTE_ERROR("Unsupported pooling type!");
654 }
655 break;
656 case 3:
657 switch(pool_type)
658 {
659 case PoolingType::AVG:
660 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
661 break;
662 case PoolingType::L2:
663 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
664 break;
665 case PoolingType::MAX:
666 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
667 break;
668 default:
669 ARM_COMPUTE_ERROR("Unsupported pooling type!");
670 }
671 break;
672 case 7:
673 switch(pool_type)
674 {
675 case PoolingType::AVG:
676 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
677 break;
678 case PoolingType::L2:
679 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
680 break;
681 case PoolingType::MAX:
682 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
683 break;
684 default:
685 ARM_COMPUTE_ERROR("Unsupported pooling type!");
686 }
687 break;
688 default:
689 switch(pool_type)
690 {
691 case PoolingType::AVG:
692 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
693 break;
694 case PoolingType::L2:
695 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
696 break;
697 case PoolingType::MAX:
698 _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
699 break;
700 default:
701 ARM_COMPUTE_ERROR("Unsupported pooling type!");
702 }
703 break;
704 }
705 }
706 else
707 {
708 switch(pool_type)
709 {
710 case PoolingType::AVG:
711 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
712 break;
713 case PoolingType::L2:
714 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
715 break;
716 case PoolingType::MAX:
717 _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
718 break;
719 default:
720 ARM_COMPUTE_ERROR("Unsupported pooling type!");
721 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000722 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100723 }
724
725 // Configure kernel window
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000726 auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size_x, pool_size_y);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000727 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
728 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100729}
730
731template <PoolingType pooling_type>
732void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
733{
734 Iterator input(_input, window_input);
735 Iterator output(_output, window);
736
737 const int fixed_point_position = _input->info()->fixed_point_position();
738 constexpr int pool_size = 2;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100739 int pool_stride_x = 0;
740 int pool_stride_y = 0;
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000741 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
742 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
743 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
744 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100745 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000746 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
747 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100748
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000749 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
750 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100751
752 execute_window_loop(window, [&](const Coordinates & id)
753 {
754 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
755 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100756 qint8x8_t lower_res = {};
757 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100758 if(pooling_type == PoolingType::AVG)
759 {
760 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000761 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100762 const qint8x8_t scale_vec = vdup_n_qs8(scale);
763
764 // Perform pooling
765 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100766 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
767 if(pool_stride_x == 1)
768 {
769 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
770 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
771 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100772 }
773 else
774 {
775 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100776 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
777 if(pool_stride_x == 1)
778 {
779 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
780 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
781 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100782 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100783 if(pool_stride_x == 1)
784 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100785 const qint8x8x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100786 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
787 }
788 else
789 {
790 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
791 }
792 },
793 input, output);
794}
795
Georgios Pinitas55186712018-01-08 17:37:12 +0000796template <PoolingType pooling_type, bool exclude_padding>
797void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window)
798{
799 Iterator input(_input, window_input);
800 Iterator output(_output, window);
801
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000802 constexpr int pool_size = 2;
803 int pool_stride_x = 0;
804 int pool_stride_y = 0;
805 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
806 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
807 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
808 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Georgios Pinitas55186712018-01-08 17:37:12 +0000809 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000810 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
811 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000812
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000813 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
814 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Georgios Pinitas55186712018-01-08 17:37:12 +0000815
816 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
817
818 execute_window_loop(window, [&](const Coordinates & id)
819 {
820 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
821 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
822 uint8x8_t lower_res = {};
823 uint8x8_t upper_res = {};
824
825 if(pooling_type != PoolingType::MAX)
826 {
827 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
828 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
829
830 // Add rows
831 const uint16x8x2_t vrsum =
832 {
833 {
834 vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]),
835 vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]),
836 }
837 };
838
839 // Pair-wise add row data
840 const uint16x4x2_t vpsum =
841 {
842 {
843 vpadd_u16(vget_low_u16(vrsum.val[0]), vget_high_u16(vrsum.val[0])),
844 vpadd_u16(vget_low_u16(vrsum.val[1]), vget_high_u16(vrsum.val[1])),
845 }
846 };
847
848 uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]);
849
850 // Scale lower result
851 scale_vector_s16x8<exclude_padding>(res_lower, id, 0, scale_step_x,
852 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000853 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +0000854 lower_res = vmovn_u16(res_lower);
855
856 // Compute upper result for stride_x == 1
857 if(pool_stride_x == 1)
858 {
859 // Shifted row sum
860 const uint16x8x2_t vrsum_shifted =
861 {
862 {
863 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
864 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
865 }
866 };
867
868 // Pair-wise add shifted row
869 const uint16x4x2_t vpsum_shifted =
870 {
871 {
872 vpadd_u16(vget_low_u16(vrsum_shifted.val[0]), vget_high_u16(vrsum_shifted.val[0])),
873 vpadd_u16(vget_low_u16(vrsum_shifted.val[1]), vget_high_u16(vrsum_shifted.val[1])),
874 }
875 };
876 uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]);
877
878 // Scale lower result
879 scale_vector_s16x8<exclude_padding>(res_upper, id, 1, 2,
880 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000881 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +0000882 upper_res = vmovn_u16(res_upper);
883 }
884 }
885 else
886 {
887 const uint8x16_t max_data = vmaxq_u8(top_data, bottom_data);
888 lower_res = vpmax_u8(vget_low_u8(max_data), vget_high_u8(max_data));
889 if(pool_stride_x == 1)
890 {
891 const uint8x16_t max_data_shifted = vextq_u8(max_data, max_data, 1);
892 upper_res = vpmax_u8(vget_low_u8(max_data_shifted), vget_high_u8(max_data_shifted));
893 }
894 }
895
896 // Store result
897 if(pool_stride_x == 1)
898 {
899 const uint8x8x2_t res = { { lower_res, upper_res } };
900 vst2_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
901 }
902 else
903 {
904 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), lower_res);
905 }
906 },
907 input, output);
908}
909
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100910template <PoolingType pooling_type>
911void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
912{
913 Iterator input(_input, window_input);
914 Iterator output(_output, window);
915
916 const int fixed_point_position = _input->info()->fixed_point_position();
917 constexpr int pool_size = 2;
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000918 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
919 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
920 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
921 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100922 int pool_stride_x = 0;
923 int pool_stride_y = 0;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100924 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000925 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
926 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100927
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000928 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
929 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100930
931 execute_window_loop(window, [&](const Coordinates & id)
932 {
933 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
934 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
935 qint16x4_t lower_res = {};
936 qint16x4_t upper_res = {};
937 if(pooling_type == PoolingType::AVG)
938 {
939 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000940 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100941 const qint16x4_t scale_vec = vdup_n_qs16(scale);
942
943 // Perform pooling
944 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
945 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
946 if(pool_stride_x == 1)
947 {
948 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
949 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
950 }
951 }
952 else
953 {
954 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
955 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
956 if(pool_stride_x == 1)
957 {
958 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
959 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
960 }
961 }
962 if(pool_stride_x == 1)
963 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100964 const qint16x4x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100965 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
966 }
967 else
968 {
969 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
970 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100971 },
972 input, output);
973}
974
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000975template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100976void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
977{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000978#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100979 Iterator input(_input, window_input);
980 Iterator output(_output, window);
981
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000982 constexpr const int pool_size = 3;
983 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
984 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
985 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
986 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
987 int pool_stride_x = 0;
988 int pool_stride_y = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100989 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000990 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
991 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100992
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000993 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
994 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
995 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100996
997 execute_window_loop(window, [&](const Coordinates & id)
998 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100999 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
1000 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
1001 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
1002 float16x4_t res = {};
1003
1004 // Get power of 2 in case of l2 pooling
1005 if(pooling_type == PoolingType::L2)
1006 {
1007 top_data = vmul_f16(top_data, top_data);
1008 middle_data = vmul_f16(middle_data, middle_data);
1009 bottom_data = vmul_f16(bottom_data, bottom_data);
1010 }
1011
1012 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +01001013 {
1014 // Calculate scale
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001015 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +01001016 const float16x4_t scale_v = vdup_n_f16(scale);
1017 // Perform pooling
1018 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
1019 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
1020 res = vmul_f16(vpadd_f16(res, res), scale_v);
1021 }
1022 else
1023 {
1024 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
1025 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
1026 res = vpmax_f16(res, res);
1027 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001028
1029 // Calculate square-root in case of l2 pooling
1030 if(pooling_type == PoolingType::L2)
1031 {
1032 res = vinv_f16(vinvsqrt_f16(res));
1033 }
1034
Pablo Tello0c34fe22017-06-26 17:17:42 +01001035 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
1036 },
1037 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001038#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001039 ARM_COMPUTE_UNUSED(window_input);
1040 ARM_COMPUTE_UNUSED(window);
1041 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001042#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001043}
1044
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001045template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +01001046void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
1047{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001048#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +01001049 Iterator input(_input, window_input);
1050 Iterator output(_output, window);
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001051 constexpr int pool_size = 2;
1052 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1053 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1054 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1055 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1056 int pool_stride_x, pool_stride_y = 0;
1057 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1058 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1059 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +01001060
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001061 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1062 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Pablo Tello0c34fe22017-06-26 17:17:42 +01001063
1064 execute_window_loop(window, [&](const Coordinates & id)
1065 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001066 auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
1067 auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
Pablo Tello0c34fe22017-06-26 17:17:42 +01001068 float16x8_t res = {};
1069
Georgios Pinitascdf51452017-08-31 14:21:36 +01001070 // Get power of 2 in case of l2 pooling
1071 if(pooling_type == PoolingType::L2)
1072 {
1073 top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
1074 top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
1075 bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
1076 bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
1077 }
1078
1079 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +01001080 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001081 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +01001082 const float16x8_t scale_v = vdupq_n_f16(scale);
1083 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
1084 }
1085 else
1086 {
1087 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
1088 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001089
1090 // Calculate square-root in case of l2 pooling
1091 if(pooling_type == PoolingType::L2)
1092 {
1093 res = vinvq_f16(vinvsqrtq_f16(res));
1094 }
1095
1096 // Store result
Pablo Tello0c34fe22017-06-26 17:17:42 +01001097 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
1098 },
1099 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001100#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001101 ARM_COMPUTE_UNUSED(window_input);
1102 ARM_COMPUTE_UNUSED(window);
1103 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001104#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001105}
1106
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001107template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001108void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
1109{
1110 Iterator input(_input, window_input);
1111 Iterator output(_output, window);
1112
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001113 constexpr int pool_size = 2;
1114 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1115 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1116 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1117 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1118 int pool_stride_x = 0;
1119 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001120 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001121 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1122 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001123
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001124 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1125 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001126
1127 execute_window_loop(window, [&](const Coordinates & id)
1128 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001129 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1130 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1131 float32x2_t res = {};
1132 float final_res = 0;
1133
1134 // Get power of 2 in case of l2 pooling
1135 if(pooling_type == PoolingType::L2)
1136 {
1137 top_data = vmul_f32(top_data, top_data);
1138 bottom_data = vmul_f32(bottom_data, bottom_data);
1139 }
1140
1141 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001142 {
1143 // Calculate scale
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001144 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001145 const float32x2_t scale_v = vdup_n_f32(scale);
1146
1147 // Perform pooling
1148 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1149 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1150 }
1151 else
1152 {
1153 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1154 res = vpmax_f32(max_data, max_data);
1155 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001156 final_res = vget_lane_f32(res, 0);
1157
1158 // Calculate square-root in case of l2 pooling
1159 if(pooling_type == PoolingType::L2)
1160 {
1161 final_res = sqrt(final_res);
1162 }
1163
1164 // Store result
1165 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001166 },
1167 input, output);
1168}
1169
1170template <PoolingType pooling_type>
1171void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
1172{
1173 Iterator input(_input, window_input);
1174 Iterator output(_output, window);
1175
1176 const int fixed_point_position = _input->info()->fixed_point_position();
1177 constexpr int pool_size = 3;
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001178 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1179 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1180 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1181 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001182 int pool_stride_x = 0;
1183 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001184 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001185 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
1186 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001187
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001188 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1189 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1190 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001191
1192 execute_window_loop(window, [&](const Coordinates & id)
1193 {
1194 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
1195 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
1196 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
1197 qint8x8_t res = {};
1198 if(pooling_type == PoolingType::AVG)
1199 {
1200 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001201 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001202
1203 // Perform pooling for stride 2
1204 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
1205 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
1206 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
1207 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
1208 if(pool_stride_x == 2)
1209 {
1210 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
1211 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001212 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001213 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001214 res = vqmul_qs8(res, scale_vec, fixed_point_position);
1215 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001216 }
1217 else
1218 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001219 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
1220 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001221 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001222 }
1223 else
1224 {
1225 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
1226 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
1227 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
1228 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
1229
1230 if(pool_stride_x == 2)
1231 {
1232 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
1233 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1234 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001235 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001236 }
1237 else
1238 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001239 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001240 }
1241 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001242 },
1243 input, output);
1244}
1245
Georgios Pinitas55186712018-01-08 17:37:12 +00001246template <PoolingType pooling_type, bool exclude_padding>
1247void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window)
1248{
1249 Iterator input(_input, window_input);
1250 Iterator output(_output, window);
1251
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001252 constexpr int pool_size = 3;
1253 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1254 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1255 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1256 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1257 int pool_stride_x = 0;
1258 int pool_stride_y = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001259 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001260 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1261 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001262
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001263 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1264 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1265 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Georgios Pinitas55186712018-01-08 17:37:12 +00001266
1267 execute_window_loop(window, [&](const Coordinates & id)
1268 {
1269 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
1270 const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
1271 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
1272
1273 if(pooling_type == PoolingType::AVG)
1274 {
1275 // Convert data to u16
1276 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
1277 const uint16x8x2_t middle_data_u16 = { { vmovl_u8(vget_low_u8(middle_data)), vmovl_u8(vget_high_u8(middle_data)) } };
1278 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
1279
1280 // Calculate row sums
1281 const uint16x8x2_t vrsum =
1282 {
1283 {
1284 vaddq_u16(vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]), middle_data_u16.val[0]),
1285 vaddq_u16(vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]), middle_data_u16.val[1]),
1286 }
1287 };
1288 const uint16x8x2_t vrsum_shifted_1 =
1289 {
1290 {
1291 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
1292 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
1293 }
1294 };
1295 const uint16x8x2_t vrsum_shifted_2 =
1296 {
1297 {
1298 vextq_u16(vrsum.val[0], vrsum.val[1], 2),
1299 vextq_u16(vrsum.val[1], vrsum.val[1], 2)
1300 }
1301 };
1302 // Calculate final sum
1303 uint16x8x2_t final_sum =
1304 {
1305 {
1306 vaddq_u16(vaddq_u16(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1307 vaddq_u16(vaddq_u16(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
1308 }
1309 };
1310 if(pool_stride_x == 2)
1311 {
1312 uint16x8_t res =
1313 {
1314 vgetq_lane_u16(final_sum.val[0], 0),
1315 vgetq_lane_u16(final_sum.val[0], 2),
1316 vgetq_lane_u16(final_sum.val[0], 4),
1317 vgetq_lane_u16(final_sum.val[0], 6),
1318 vgetq_lane_u16(final_sum.val[1], 0),
1319 vgetq_lane_u16(final_sum.val[1], 2),
1320 vgetq_lane_u16(final_sum.val[1], 4),
1321 vgetq_lane_u16(final_sum.val[1], 6),
1322 };
1323
1324 scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
1325 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001326 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001327 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
1328 }
1329 else
1330 {
1331 // Scale lower result
1332 scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
1333 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001334 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001335 // Scale lower result
1336 scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
1337 pool_size, upper_bound_w, upper_bound_h,
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001338 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001339 const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
1340 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
1341 }
1342 }
1343 else
1344 {
1345 const uint8x16_t max_data = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data);
1346 const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1);
1347 const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2);
1348 const uint8x16_t final_max = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2);
1349
1350 if(pool_stride_x == 2)
1351 {
1352 const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
1353 static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1354 const uint8x8_t res = vtbl2_u8(table, lookup_val);
1355 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
1356 }
1357 else
1358 {
1359 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
1360 }
1361 }
1362 },
1363 input, output);
1364}
1365
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001366template <PoolingType pooling_type>
1367void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
1368{
1369 Iterator input(_input, window_input);
1370 Iterator output(_output, window);
1371
1372 const int fixed_point_position = _input->info()->fixed_point_position();
1373 constexpr int pool_size = 3;
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001374 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1375 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1376 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1377 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001378 int pool_stride_x = 0;
1379 int pool_stride_y = 0;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001380 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001381 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
1382 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001383
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001384 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1385 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1386 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001387
1388 execute_window_loop(window, [&](const Coordinates & id)
1389 {
1390 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
1391 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
1392 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
1393
1394 if(pooling_type == PoolingType::AVG)
1395 {
1396 // Calculate scale
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001397 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001398
1399 // Perform pooling for stride 2
1400 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
1401 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
1402 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
1403 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
1404 if(pool_stride_x == 2)
1405 {
1406 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
1407 const qint16x4_t scale_vec = vdup_n_qs16(scale);
1408 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
1409 }
1410 else
1411 {
1412 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
1413 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
1414 }
1415 }
1416 else
1417 {
1418 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
1419 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
1420 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
1421 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
1422
1423 if(pool_stride_x == 2)
1424 {
1425 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
1426 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
1427 }
1428 else
1429 {
1430 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
1431 }
1432 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001433 },
1434 input, output);
1435}
1436
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001437template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001438void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
1439{
1440 Iterator input(_input, window_input);
1441 Iterator output(_output, window);
1442
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001443 constexpr const int pool_size = 3;
1444 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1445 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1446 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1447 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1448 int pool_stride_x = 0;
1449 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001450 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001451 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1452 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001453
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001454 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1455 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1456 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001457
1458 execute_window_loop(window, [&](const Coordinates & id)
1459 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001460 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1461 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1462 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1463 float32x2_t res = {};
1464 float final_res = 0;
1465
1466 // Get power of 2 in case of l2 pooling
1467 if(pooling_type == PoolingType::L2)
1468 {
1469 top_data = vmulq_f32(top_data, top_data);
1470 middle_data = vmulq_f32(middle_data, middle_data);
1471 bottom_data = vmulq_f32(bottom_data, bottom_data);
1472 }
1473
1474 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001475 {
1476 // Calculate scale
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001477 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001478 const float32x2_t scale_v = vdup_n_f32(scale);
1479
1480 // Perform pooling
1481 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1482 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1483 res = vmul_f32(vpadd_f32(res, res), scale_v);
1484 }
1485 else
1486 {
1487 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1488 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1489 res = vpmax_f32(res, res);
1490 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001491 final_res = vget_lane_f32(res, 0);
1492
1493 // Calculate square-root in case of l2 pooling
1494 if(pooling_type == PoolingType::L2)
1495 {
1496 final_res = sqrt(final_res);
1497 }
1498
1499 // Store result
1500 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001501 },
1502 input, output);
1503}
1504
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001505template <PoolingType pooling_type, bool exclude_padding>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001506void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
1507{
1508 Iterator input(_input, window_input);
1509 Iterator output(_output, window);
1510
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001511 constexpr const int pool_size = 7;
1512 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1513 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1514 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1515 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1516 int pool_stride_x = 0;
1517 int pool_stride_y = 0;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001518 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001519 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1520 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001521
1522 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1523 for(int i = 0; i < pool_size; ++i)
1524 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001525 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001526 }
1527
1528 execute_window_loop(window, [&](const Coordinates & id)
1529 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001530 float32x2_t res = {};
1531 float final_res = 0.f;
1532 if(pooling_type != PoolingType::MAX)
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001533 {
1534 // Calculate scale
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001535 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001536 const float32x2_t scale_v = vdup_n_f32(scale);
1537
1538 // Perform pooling
Georgios Pinitascdf51452017-08-31 14:21:36 +01001539 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1540 // Get power of 2 in case of l2 pooling
1541 if(pooling_type == PoolingType::L2)
1542 {
1543 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1544 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1545 }
1546 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001547 for(int i = 1; i < pool_size; ++i)
1548 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001549 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1550 // Get power of 2 in case of l2 pooling
1551 if(pooling_type == PoolingType::L2)
1552 {
1553 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1554 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1555 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001556 sum_data = vaddq_f32(sum_data, data.val[0]);
1557 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1558 }
1559 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1560 res = vmul_f32(vpadd_f32(res, res), scale_v);
1561 }
1562 else
1563 {
1564 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1565 for(int i = 1; i < pool_size; ++i)
1566 {
1567 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1568 max_data = vmax2q_f32(max_data, data);
1569 }
1570 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1571 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1572 res = vpmax_f32(res, res);
1573 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001574 final_res = vget_lane_f32(res, 0);
1575
1576 // Calculate square-root in case of l2 pooling
1577 if(pooling_type == PoolingType::L2)
1578 {
1579 final_res = sqrt(final_res);
1580 }
1581
1582 // Store result
1583 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001584 },
1585 input, output);
1586}
1587
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001588template <PoolingType pooling_type>
1589void NEPoolingLayerKernel::poolingMxN_q8(const Window &window_input, const Window &window)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001590{
1591 Iterator input(_input, window_input);
1592 Iterator output(_output, window);
1593
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001594 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
1595 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
1596 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1597 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1598 int pool_stride_x = 0;
1599 int pool_stride_y = 0;
1600 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1601
1602 execute_window_loop(window, [&](const Coordinates & id)
1603 {
1604 qint8x16_t vres = {};
1605 qint8_t res = {};
1606
1607 //PoolingType::MAX
1608 for(int y = 0; y < pool_size_y; ++y)
1609 {
1610 int x = 0;
1611 for(; x <= (pool_size_x - 16); x += 16)
1612 {
1613 const qint8x16_t data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1614 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1615 vres = vmaxq_s8(vres, data);
1616 }
1617
1618 // Leftover for loop
1619 for(; x < pool_size_x; ++x)
1620 {
1621 qint8_t data = *(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1622 res = std::max(res, data);
1623 }
1624 }
1625 //Reduce
1626 const qint8x8_t half_vres = vpmax_s8(vget_low_s8(vres), vget_high_s8(vres));
1627 res = std::max(res, vget_lane_s8(half_vres, 0));
1628 res = std::max(res, vget_lane_s8(half_vres, 1));
1629 res = std::max(res, vget_lane_s8(half_vres, 2));
1630 res = std::max(res, vget_lane_s8(half_vres, 3));
1631 res = std::max(res, vget_lane_s8(half_vres, 4));
1632 res = std::max(res, vget_lane_s8(half_vres, 5));
1633 res = std::max(res, vget_lane_s8(half_vres, 6));
1634 res = std::max(res, vget_lane_s8(half_vres, 7));
1635
1636 // Store result
1637 *(reinterpret_cast<qint8_t *>(output.ptr())) = res;
1638 },
1639 input, output);
1640}
1641
1642template <PoolingType pooling_type>
1643void NEPoolingLayerKernel::poolingMxN_q16(const Window &window_input, const Window &window)
1644{
1645 Iterator input(_input, window_input);
1646 Iterator output(_output, window);
1647
1648 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
1649 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
1650 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1651 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1652 int pool_stride_x = 0;
1653 int pool_stride_y = 0;
1654 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1655
1656 execute_window_loop(window, [&](const Coordinates & id)
1657 {
1658 qint16x8_t vres = {};
1659 qint16_t res = {};
1660
1661 //PoolingType::MAX
1662 for(int y = 0; y < pool_size_y; ++y)
1663 {
1664 int x = 0;
1665 for(; x <= (pool_size_x - 8); x += 8)
1666 {
1667 const qint16x8_t data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1668 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1669 vres = vmaxq_s16(vres, data);
1670 }
1671
1672 // Leftover for loop
1673 for(; x < pool_size_x; ++x)
1674 {
1675 qint16_t data = *(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1676 res = std::max(res, data);
1677 }
1678 }
1679 //Reduce
1680 const qint16x4_t half_vres = vpmax_s16(vget_low_s16(vres), vget_high_s16(vres));
1681 res = std::max(res, vget_lane_s16(half_vres, 0));
1682 res = std::max(res, vget_lane_s16(half_vres, 1));
1683 res = std::max(res, vget_lane_s16(half_vres, 2));
1684 res = std::max(res, vget_lane_s16(half_vres, 3));
1685
1686 // Store result
1687 *(reinterpret_cast<qint16_t *>(output.ptr())) = res;
1688 },
1689 input, output);
1690}
1691
1692template <PoolingType pooling_type, bool exclude_padding>
1693void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Window &window)
1694{
1695#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1696 Iterator input(_input, window_input);
1697 Iterator output(_output, window);
1698
1699 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
1700 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
1701 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1702 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1703 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1704 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1705 int pool_stride_x = 0;
1706 int pool_stride_y = 0;
1707 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1708 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1709 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1710
1711 execute_window_loop(window, [&](const Coordinates & id)
1712 {
1713 float16_t res = 0.0f;
1714 float16x8_t vres = vdupq_n_f16(0.0f);
1715
1716 if(pooling_type != PoolingType::MAX)
1717 {
1718 // Calculate scale
1719 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1720
1721 // Perform pooling
1722
1723 for(int y = 0; y < pool_size_y; ++y)
1724 {
1725 int x = 0;
1726 for(; x <= (pool_size_x - 8); x += 8)
1727 {
1728 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1729 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1730
1731 // Get power of 2 in case of l2 pooling and accumulate
1732 if(pooling_type == PoolingType::L2)
1733 {
1734 vres = vaddq_f16(vres, vmulq_f16(data, data));
1735 }
1736 else
1737 {
1738 vres = vaddq_f16(vres, data);
1739 }
1740 }
1741
1742 // Leftover for loop
1743 for(; x < pool_size_x; ++x)
1744 {
1745 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1746
1747 // Get power of 2 in case of l2 pooling
1748 if(pooling_type == PoolingType::L2)
1749 {
1750 data *= data;
1751 }
1752
1753 res += data;
1754 }
1755 }
1756
1757 // Reduction
1758 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1759 res += vget_lane_f16(tmp, 0);
1760 res += vget_lane_f16(tmp, 1);
1761 res += vget_lane_f16(tmp, 2);
1762 res += vget_lane_f16(tmp, 3);
1763
1764 // Divide by scale
1765 res *= scale;
1766 }
1767 else
1768 {
1769 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1770 res = std::numeric_limits<float>::lowest();
1771
1772 for(int y = 0; y < pool_size_y; ++y)
1773 {
1774 int x = 0;
1775 for(; x <= (pool_size_x - 8); x += 8)
1776 {
1777 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1778 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1779 vres = vmaxq_f16(vres, data);
1780 }
1781
1782 // Leftover for loop
1783 for(; x < pool_size_x; ++x)
1784 {
1785 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1786 res = std::max(res, data);
1787 }
1788 }
1789
1790 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1791 res = std::max(res, vget_lane_f16(tmp, 0));
1792 res = std::max(res, vget_lane_f16(tmp, 1));
1793 res = std::max(res, vget_lane_f16(tmp, 2));
1794 res = std::max(res, vget_lane_f16(tmp, 3));
1795 }
1796
1797 // Calculate square-root in case of l2 pooling
1798 if(pooling_type == PoolingType::L2)
1799 {
1800 res = std::sqrt(res);
1801 }
1802
1803 // Store result
1804 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1805 },
1806 input, output);
1807
1808#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1809 ARM_COMPUTE_UNUSED(window_input);
1810 ARM_COMPUTE_UNUSED(window);
1811 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1812#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1813}
1814
1815template <PoolingType pooling_type, bool exclude_padding>
1816void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Window &window)
1817{
1818 Iterator input(_input, window_input);
1819 Iterator output(_output, window);
1820
1821 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
1822 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001823 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1824 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1825 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1826 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1827 int pool_stride_x = 0;
1828 int pool_stride_y = 0;
Gian Marco Iodice16824302017-09-28 15:41:37 +01001829 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001830 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1831 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001832
1833 execute_window_loop(window, [&](const Coordinates & id)
1834 {
1835 float res = 0.0f;
1836
1837 if(pooling_type != PoolingType::MAX)
1838 {
1839 // Calculate scale
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001840 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001841
1842 // Perform pooling
1843 float32x4_t vres = vdupq_n_f32(0.0f);
1844
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001845 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001846 {
1847 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001848 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001849 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001850 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1851 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001852
1853 // Get power of 2 in case of l2 pooling and accumulate
1854 if(pooling_type == PoolingType::L2)
1855 {
1856 vres = vmlaq_f32(vres, data, data);
1857 }
1858 else
1859 {
1860 vres = vaddq_f32(vres, data);
1861 }
1862 }
1863
1864 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001865 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001866 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001867 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001868
1869 // Get power of 2 in case of l2 pooling
1870 if(pooling_type == PoolingType::L2)
1871 {
1872 data *= data;
1873 }
1874
1875 res += data;
1876 }
1877 }
1878
1879#if defined(__aarch64__)
1880 // Reduction operation available on 64 bit architectures only
1881 res += vaddvq_f32(vres);
1882#else // __aarch64__
1883 // Reduction
1884 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1885 tmp = vpadd_f32(tmp, tmp);
1886
1887 res += vget_lane_f32(tmp, 0);
1888#endif // __aarch64__
1889 // Divide by scale
1890 res *= scale;
1891 }
1892 else
1893 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001894 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1895 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001896
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001897 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001898 {
1899 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001900 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001901 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001902 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1903 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001904 vres = vmaxq_f32(vres, data);
1905 }
1906
1907 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001908 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001909 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001910 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001911 res = std::max(res, data);
1912 }
1913 }
1914
1915#if defined(__aarch64__)
1916 // Reduction operation available on 64 bit architectures only
1917 res = std::max(vmaxvq_f32(vres), res);
1918#else // __aarch64__
1919 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1920 tmp = vpmax_f32(tmp, tmp);
1921
1922 res = std::max(res, vget_lane_f32(tmp, 0));
1923#endif // __aarch64__
1924 }
1925
1926 // Calculate square-root in case of l2 pooling
1927 if(pooling_type == PoolingType::L2)
1928 {
1929 res = std::sqrt(res);
1930 }
1931
1932 // Store result
1933 *(reinterpret_cast<float *>(output.ptr())) = res;
1934 },
1935 input, output);
1936}
1937
Georgios Pinitas55186712018-01-08 17:37:12 +00001938template <PoolingType pooling_type, bool exclude_padding>
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001939void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const Window &window)
Georgios Pinitas55186712018-01-08 17:37:12 +00001940{
1941 Iterator input(_input, window_input);
1942 Iterator output(_output, window);
1943
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001944 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
1945 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001946 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1947 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1948 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1949 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1950 int pool_stride_x = 0;
1951 int pool_stride_y = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001952 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001953 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1954 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001955
1956 execute_window_loop(window, [&](const Coordinates & id)
1957 {
1958 uint8_t res = 0;
1959
1960 if(pooling_type != PoolingType::MAX)
1961 {
1962 uint32x4_t vres = vdupq_n_u32(0);
1963 uint32_t sres = 0;
1964
1965 // Calculate scale
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001966 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001967
1968 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001969 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001970 {
1971 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001972 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001973 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001974 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1975 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001976
1977 const uint16x8_t data_u16 = vmovl_u8(data);
1978 vres = vaddq_u32(vres, vaddl_u16(vget_high_u16(data_u16), vget_low_u16(data_u16)));
1979 }
1980
1981 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001982 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001983 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001984 uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001985 sres += data;
1986 }
1987 }
1988
1989 // Reduction
1990 const auto tmp = vpadd_u32(vget_high_u32(vres), vget_low_u32(vres));
1991 sres += vget_lane_u32(tmp, 0) + vget_lane_u32(tmp, 1);
1992
1993 // Divide by scale
1994 res = static_cast<uint8_t>(support::cpp11::round(sres * scale));
1995 }
1996 else
1997 {
1998 uint8x8_t vres = vdup_n_u8(0);
1999 res = 0;
2000
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002001 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002002 {
2003 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002004 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002005 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00002006 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
2007 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00002008 vres = vmax_u8(vres, data);
2009 }
2010
2011 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002012 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002013 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00002014 const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00002015 res = std::max(res, data);
2016 }
2017 }
2018
2019 // Reduce max
2020 vres = vpmax_u8(vres, vres);
2021 vres = vpmax_u8(vres, vres);
2022 vres = vpmax_u8(vres, vres);
2023
2024 // Get max value
2025 res = std::max(res, vget_lane_u8(vres, 0));
2026 }
2027
2028 // Store result
2029 *(reinterpret_cast<uint8_t *>(output.ptr())) = res;
2030 },
2031 input, output);
2032}
2033
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002034Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
2035{
2036 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2037
2038 unsigned int pooled_w = 0;
2039 unsigned int pooled_h = 0;
2040 unsigned int num_elems_processed_per_iteration = 0;
2041 BorderSize border_size(0);
2042
2043 const bool is_global_pooling = pool_info.is_global_pooling();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002044 const unsigned int pool_size_x = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
2045 const unsigned int pool_size_y = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002046
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002047 // Validate pool info before calling scaled_dimensions
2048 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002049
2050 // Check output dimensions
2051 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
2052 input->dimension(1),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002053 pool_size_x,
2054 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002055 pool_info.pad_stride_info());
2056
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002057 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size_x));
2058 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
2059 pool_size_x, pool_size_y)
2060 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002061
2062 return Status{};
2063}
2064
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002065void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002066{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002067 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002068 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2069 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2070 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2071
Pablo Tello0c34fe22017-06-26 17:17:42 +01002072 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
2073 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Isabella Gottardi6e464c32018-01-26 12:32:45 +00002074 const unsigned int pool_size = _pool_info.pool_size().width;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002075
2076 // Set step for input in x and y direction for the input
2077 Window window_input(window);
2078 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +01002079 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002080 {
Pablo Tello0c34fe22017-06-26 17:17:42 +01002081 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01002082 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +01002083 case DataType::F16:
2084 {
2085 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2086 break;
2087 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002088 case DataType::QASYMM8:
2089 {
2090 window_x_inc = pool_stride_x;
2091 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2092 {
2093 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2094 }
2095 break;
2096 }
Pablo Tello0c34fe22017-06-26 17:17:42 +01002097 case DataType::F32:
2098 {
2099 window_x_inc = pool_stride_x;
2100 break;
2101 }
2102 default:
2103 {
2104 ARM_COMPUTE_ERROR("Not supported");
2105 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002106 }
2107 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2108 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
2109
2110 // Run function
2111 (this->*_func)(window_input, window);
2112}