blob: ff4802c5e02c7360be47be2a65b55f2db12f8050 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Georgios Pinitas55186712018-01-08 17:37:12 +00002 * Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/FixedPoint.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
38
Georgios Pinitas55186712018-01-08 17:37:12 +000039#include "support/ToolchainSupport.h"
40
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041#include <algorithm>
42#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010043#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010045#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046#include <string>
47#include <tuple>
48
49using namespace arm_compute;
50
51namespace
52{
Michalis Spyrouafa5d812017-11-30 14:25:57 +000053void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
54{
55 TensorShape output_shape{ input->tensor_shape() };
56 output_shape.set(0, pooled_w);
57 output_shape.set(1, pooled_h);
58
59 auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
60}
61
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000062template <bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010063inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
64 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
65{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000066 int start_x = id.x() * stride_x - pad_x;
67 int start_y = id.y() * stride_y - pad_y;
Pablo Tello0c34fe22017-06-26 17:17:42 +010068 const int end_x = std::min(start_x + pool_size, upper_bound_w);
69 const int end_y = std::min(start_y + pool_size, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000070 if(exclude_padding)
71 {
72 start_x = std::max(0, start_x);
73 start_y = std::max(0, start_y);
74 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075 return 1.f / ((end_y - start_y) * (end_x - start_x));
76}
77
78inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
79 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
80{
Pablo Tello0c34fe22017-06-26 17:17:42 +010081 static const std::array<qint8_t, 10> scale_values_q8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010082 { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
83 const int start_x = id.x() * stride_x - pad_x;
84 const int start_y = id.y() * stride_y - pad_y;
85 const int end_x = std::min(start_x + pool_size, upper_bound_w);
86 const int end_y = std::min(start_y + pool_size, upper_bound_h);
87 const int val = ((end_y - start_y) * (end_x - start_x));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010088 return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
89}
90
91inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
92 int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
93{
94 static std::array<qint16_t, 10> scale_values_q16 =
95 { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
96 const int start_x = id.x() * stride_x - pad_x;
97 const int start_y = id.y() * stride_y - pad_y;
98 const int end_x = std::min(start_x + pool_size, upper_bound_w);
99 const int end_y = std::min(start_y + pool_size, upper_bound_h);
100 const int val = ((end_y - start_y) * (end_x - start_x));
101 return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100102}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103
Georgios Pinitas55186712018-01-08 17:37:12 +0000104template <bool exclude_padding>
105inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
106 const int pool_size, const int upper_bound_w, const int upper_bound_h,
107 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
108{
109 int start_x = (id.x() + id_offset) * stride_x - pad_x;
110 int start_y = id.y() * stride_y - pad_y;
111 const int end_y = std::min(start_y + pool_size, upper_bound_h);
112 if(exclude_padding)
113 {
114 start_y = std::max(0, start_y);
115 }
116
117 std::array<uint16_t, 8> elems =
118 {
119 {
120 vgetq_lane_u16(v, 0),
121 vgetq_lane_u16(v, 1),
122 vgetq_lane_u16(v, 2),
123 vgetq_lane_u16(v, 3),
124 vgetq_lane_u16(v, 4),
125 vgetq_lane_u16(v, 5),
126 vgetq_lane_u16(v, 6),
127 vgetq_lane_u16(v, 7),
128 }
129 };
130
131 for(auto &el : elems)
132 {
133 int c_start_x = start_x;
134 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
135 if(exclude_padding)
136 {
137 c_start_x = std::max(0, c_start_x);
138 }
139 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
140 el *= scale;
141 start_x += step * stride_x;
142 }
143
144 v = vsetq_lane_u16(elems[0], v, 0);
145 v = vsetq_lane_u16(elems[1], v, 1);
146 v = vsetq_lane_u16(elems[2], v, 2);
147 v = vsetq_lane_u16(elems[3], v, 3);
148 v = vsetq_lane_u16(elems[4], v, 4);
149 v = vsetq_lane_u16(elems[5], v, 5);
150 v = vsetq_lane_u16(elems[6], v, 6);
151 v = vsetq_lane_u16(elems[7], v, 7);
152}
153
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000154Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000156 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100157
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000158 int pool_pad_x = 0;
159 int pool_pad_y = 0;
160 int pool_stride_x = 0;
161 int pool_stride_y = 0;
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000162 PoolingType pool_type = pool_info.pool_type();
Georgios Pinitas4c2dd542017-11-13 12:58:41 +0000163 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
164 const bool exclude_padding = pool_info.exclude_padding();
165 const bool is_global_pooling = pool_info.is_global_pooling();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100166 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
167 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Gian Marco Iodice16824302017-09-28 15:41:37 +0100168 static const std::set<int> supported_pool_sizes = { 2, 3 };
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100169
Georgios Pinitas55186712018-01-08 17:37:12 +0000170 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
171 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
172 ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8)));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000173 ARM_COMPUTE_RETURN_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size));
174 ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()));
175 ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
176 ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100177
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000178 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100179 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000180 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
181 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
182 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h));
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100183 }
184
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000185 return Status{};
186}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100187
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000188Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size)
189{
190 const bool is_global_pooling = pool_info.is_global_pooling();
191 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
192 "Global pooling is supported only with rectangular inputs!");
193 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
194 "Invalid pool size and pool pad combination!");
195
196 return Status{};
197}
198
199std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
200 BorderSize &border_size,
201 unsigned int pooled_w, unsigned int pooled_h, int pool_size)
202{
203 unsigned int num_elems_read_per_iteration = 0;
204 unsigned int num_elems_horizontal_window = 0;
205 int pool_pad_x = 0;
206 int pool_pad_y = 0;
207 int pool_stride_x = 0;
208 int pool_stride_y = 0;
209 const int input_width = input->dimension(0);
210 const int input_height = input->dimension(1);
211 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
212 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
213 std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
214
215 // Check output dimensions
216 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
217 input->dimension(1),
218 pool_size,
219 pool_size,
220 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100221
222 // Select element size
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000223 switch(input->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100224 {
225 case DataType::QS8:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100226 num_elems_read_per_iteration = 16;
227 switch(pool_size)
228 {
229 case 2:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100230 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100231 break;
232 case 3:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100233 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100234 break;
235 default:
236 ARM_COMPUTE_ERROR("Pooling size not supported");
Pablo Tello0c34fe22017-06-26 17:17:42 +0100237 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100238 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100239 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
240 break;
Georgios Pinitas55186712018-01-08 17:37:12 +0000241 case DataType::QASYMM8:
242 switch(pool_size)
243 {
244 case 2:
245 num_elems_read_per_iteration = 16;
246 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
247 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
248 break;
249 case 3:
250 num_elems_read_per_iteration = 16;
251 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
252 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
253 break;
254 default:
255 num_elems_read_per_iteration = 1;
256 num_elems_processed_per_iteration = 1;
257 num_elems_horizontal_window = 1;
258 break;
259 }
260 break;
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100261 case DataType::QS16:
262 num_elems_read_per_iteration = 8;
263 switch(pool_size)
264 {
265 case 2:
266 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
267 break;
268 case 3:
269 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
270 break;
271 default:
272 ARM_COMPUTE_ERROR("Pooling size not supported");
273 }
274 num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100275 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000276#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100277 case DataType::F16:
278 switch(pool_size)
279 {
280 case 2:
281 num_elems_read_per_iteration = 16;
282 num_elems_processed_per_iteration = 8;
283 num_elems_horizontal_window = 8;
284 break;
285 case 3:
286 num_elems_read_per_iteration = 4;
287 num_elems_processed_per_iteration = 1;
288 num_elems_horizontal_window = 1;
289 break;
290 default:
291 ARM_COMPUTE_ERROR("Pooling size not supported");
292 break;
293 }
294 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000295#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100296 case DataType::F32:
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100297 switch(pool_size)
298 {
299 case 2:
300 num_elems_read_per_iteration = 2;
301 break;
302 case 3:
303 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
304 break;
305 case 7:
306 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
307 break;
308 default:
Gian Marco Iodice16824302017-09-28 15:41:37 +0100309 num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop
Pablo Tello0c34fe22017-06-26 17:17:42 +0100310 break;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100311 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100312 num_elems_processed_per_iteration = 1;
313 num_elems_horizontal_window = 1;
314 break;
315 default:
316 ARM_COMPUTE_ERROR("Element size not supported");
317 break;
318 }
319
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000320 // Number of iterations in X dimension
321 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
322
323 // Upper limit for the number of right/bottom border elements that are accessed
324 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000325 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
326
327 border_size = BorderSize(pool_pad_y, pool_pad_x);
328 border_size.right = std::max(upper_bound_w, pool_pad_x);
329 border_size.bottom = std::max(upper_bound_h, pool_pad_y);
330 bool window_changed = false;
331
332 TensorShape output_shape{ input->tensor_shape() };
333 output_shape.set(0, pooled_w);
334 output_shape.set(1, pooled_h);
335 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
336
337 Window win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
338 AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
339
340 if(output->total_size() != 0)
341 {
342 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
343 window_changed = update_window_and_padding(win, input_access, output_access);
344 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
345 }
346 else
347 {
348 window_changed = update_window_and_padding(win, input_access);
349 }
350
351 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
352 return std::make_pair(err, win);
353}
354} // namespace
355
356NEPoolingLayerKernel::NEPoolingLayerKernel()
357 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
358{
359}
360
361BorderSize NEPoolingLayerKernel::border_size() const
362{
363 return _border_size;
364}
365
366void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
367{
368 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
369
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000370 const PoolingType pool_type = pool_info.pool_type();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000371 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
372 const bool exclude_padding = pool_info.exclude_padding();
373 const bool is_global_pooling = pool_info.is_global_pooling();
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000374 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000375
376 // Update pool size in case of global pooling
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000377 const int pool_size = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000378
379 // Validate pool info before calling scaled_dimensions
380 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));
381
382 // Check output dimensions
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000383 unsigned int pooled_w, pooled_h;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000384 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
385 input->info()->dimension(1),
386 pool_size,
387 pool_size,
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000388 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000389
390 // Output auto initialization if not yet initialized
391 auto_init(input->info(), output->info(), pooled_w, pooled_h);
392
393 // Perform validation step
394 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100395
396 // Set instance variables
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000397 _input = input;
398 _output = output;
399 _pool_info = pool_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100400
Georgios Pinitas55186712018-01-08 17:37:12 +0000401 // Get data type
402 const DataType data_type = input->info()->data_type();
403
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100404 // Select appropriate function
Georgios Pinitas55186712018-01-08 17:37:12 +0000405 if(data_type == DataType::QS8)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100406 {
Georgios Pinitas55186712018-01-08 17:37:12 +0000407 switch(pool_size)
408 {
409 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100410 switch(pool_type)
411 {
412 case PoolingType::AVG:
413 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
414 break;
415 case PoolingType::MAX:
416 _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
417 break;
418 default:
419 ARM_COMPUTE_ERROR("Unsupported pooling type!");
420 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000421 break;
422 case 3:
423 switch(pool_type)
424 {
425 case PoolingType::AVG:
426 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
427 break;
428 case PoolingType::MAX:
429 _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
430 break;
431 default:
432 ARM_COMPUTE_ERROR("Unsupported pooling type!");
433 }
434 break;
435 default:
436 ARM_COMPUTE_ERROR("Unsupported pooling size!");
437 }
438 }
439 else if(data_type == DataType::QASYMM8)
440 {
441 if(pool_size == 2 && pool_stride_x < 3)
442 {
443 switch(pool_type)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100444 {
Georgios Pinitas55186712018-01-08 17:37:12 +0000445 case PoolingType::AVG:
446 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, false>;
447 break;
448 case PoolingType::MAX:
449 _func = &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::MAX>;
450 break;
451 default:
452 ARM_COMPUTE_ERROR("Unsupported pooling type!");
453 }
454 }
455 else if(pool_size == 3 && pool_stride_x < 3)
456 {
457 switch(pool_type)
458 {
459 case PoolingType::AVG:
460 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, false>;
461 break;
462 case PoolingType::MAX:
463 _func = &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::MAX>;
464 break;
465 default:
466 ARM_COMPUTE_ERROR("Unsupported pooling type!");
467 }
468 }
469 else
470 {
471 switch(pool_type)
472 {
473 case PoolingType::AVG:
474 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, false>;
475 break;
476 case PoolingType::MAX:
477 _func = &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::MAX>;
478 break;
479 default:
480 ARM_COMPUTE_ERROR("Unsupported pooling type!");
481 }
482 }
483 }
484 else if(data_type == DataType::QS16)
485 {
486 switch(pool_size)
487 {
488 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100489 switch(pool_type)
490 {
491 case PoolingType::AVG:
492 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
493 break;
494 case PoolingType::MAX:
495 _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
496 break;
497 default:
498 ARM_COMPUTE_ERROR("Unsupported pooling type!");
499 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000500 break;
501 case 3:
502 switch(pool_type)
503 {
504 case PoolingType::AVG:
505 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
506 break;
507 case PoolingType::MAX:
508 _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
509 break;
510 default:
511 ARM_COMPUTE_ERROR("Unsupported pooling type!");
512 }
513 break;
514 default:
515 ARM_COMPUTE_ERROR("Unsupported pooling size!");
516 }
517 }
518 else if(data_type == DataType::F16)
519 {
520 switch(pool_size)
521 {
522 case 2:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100523 switch(pool_type)
524 {
525 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000526 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100527 break;
528 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000529 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100530 break;
531 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000532 _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100533 break;
534 default:
535 ARM_COMPUTE_ERROR("Unsupported pooling type!");
536 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000537 break;
538 case 3:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100539 switch(pool_type)
540 {
541 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000542 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100543 break;
544 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000545 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100546 break;
547 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000548 _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100549 break;
550 default:
551 ARM_COMPUTE_ERROR("Unsupported pooling type!");
552 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000553 break;
554 default:
555 ARM_COMPUTE_ERROR("Unsupported pooling size!");
556 }
557 }
558 else if(data_type == DataType::F32)
559 {
560 switch(pool_size)
561 {
562 case 2:
563 switch(pool_type)
564 {
565 case PoolingType::AVG:
566 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
567 break;
568 case PoolingType::L2:
569 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
570 break;
571 case PoolingType::MAX:
572 _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
573 break;
574 default:
575 ARM_COMPUTE_ERROR("Unsupported pooling type!");
576 }
577 break;
578 case 3:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100579 switch(pool_type)
580 {
581 case PoolingType::AVG:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000582 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100583 break;
584 case PoolingType::L2:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000585 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100586 break;
587 case PoolingType::MAX:
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000588 _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
Georgios Pinitascdf51452017-08-31 14:21:36 +0100589 break;
590 default:
591 ARM_COMPUTE_ERROR("Unsupported pooling type!");
592 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000593 break;
594 case 7:
595 switch(pool_type)
596 {
597 case PoolingType::AVG:
598 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
599 break;
600 case PoolingType::L2:
601 _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
602 break;
603 case PoolingType::MAX:
604 _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
605 break;
606 default:
607 ARM_COMPUTE_ERROR("Unsupported pooling type!");
608 }
609 break;
610 default:
611 switch(pool_type)
612 {
613 case PoolingType::AVG:
614 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
615 break;
616 case PoolingType::L2:
617 _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
618 break;
619 case PoolingType::MAX:
620 _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
621 break;
622 default:
623 ARM_COMPUTE_ERROR("Unsupported pooling type!");
624 }
625 break;
626 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100627 }
628
629 // Configure kernel window
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000630 auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size);
631 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
632 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100633}
634
635template <PoolingType pooling_type>
636void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
637{
638 Iterator input(_input, window_input);
639 Iterator output(_output, window);
640
641 const int fixed_point_position = _input->info()->fixed_point_position();
642 constexpr int pool_size = 2;
643 int pool_pad_x = 0;
644 int pool_pad_y = 0;
645 int pool_stride_x = 0;
646 int pool_stride_y = 0;
647 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
648 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
649 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
650 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
651
652 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
653 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
654
655 execute_window_loop(window, [&](const Coordinates & id)
656 {
657 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
658 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100659 qint8x8_t lower_res = {};
660 qint8x8_t upper_res = {};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100661 if(pooling_type == PoolingType::AVG)
662 {
663 // Calculate scale
664 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
665 const qint8x8_t scale_vec = vdup_n_qs8(scale);
666
667 // Perform pooling
668 const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100669 lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
670 if(pool_stride_x == 1)
671 {
672 const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
673 upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
674 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100675 }
676 else
677 {
678 const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100679 lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
680 if(pool_stride_x == 1)
681 {
682 const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
683 upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
684 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100685 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100686 if(pool_stride_x == 1)
687 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100688 const qint8x8x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100689 vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
690 }
691 else
692 {
693 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
694 }
695 },
696 input, output);
697}
698
Georgios Pinitas55186712018-01-08 17:37:12 +0000699template <PoolingType pooling_type, bool exclude_padding>
700void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window)
701{
702 Iterator input(_input, window_input);
703 Iterator output(_output, window);
704
705 constexpr int pool_size = 2;
706 int pool_pad_x = 0;
707 int pool_pad_y = 0;
708 int pool_stride_x = 0;
709 int pool_stride_y = 0;
710 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
711 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
712 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
713 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
714
715 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
716 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
717
718 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
719
720 execute_window_loop(window, [&](const Coordinates & id)
721 {
722 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
723 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
724 uint8x8_t lower_res = {};
725 uint8x8_t upper_res = {};
726
727 if(pooling_type != PoolingType::MAX)
728 {
729 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
730 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
731
732 // Add rows
733 const uint16x8x2_t vrsum =
734 {
735 {
736 vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]),
737 vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]),
738 }
739 };
740
741 // Pair-wise add row data
742 const uint16x4x2_t vpsum =
743 {
744 {
745 vpadd_u16(vget_low_u16(vrsum.val[0]), vget_high_u16(vrsum.val[0])),
746 vpadd_u16(vget_low_u16(vrsum.val[1]), vget_high_u16(vrsum.val[1])),
747 }
748 };
749
750 uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]);
751
752 // Scale lower result
753 scale_vector_s16x8<exclude_padding>(res_lower, id, 0, scale_step_x,
754 pool_size, upper_bound_w, upper_bound_h,
755 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
756 lower_res = vmovn_u16(res_lower);
757
758 // Compute upper result for stride_x == 1
759 if(pool_stride_x == 1)
760 {
761 // Shifted row sum
762 const uint16x8x2_t vrsum_shifted =
763 {
764 {
765 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
766 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
767 }
768 };
769
770 // Pair-wise add shifted row
771 const uint16x4x2_t vpsum_shifted =
772 {
773 {
774 vpadd_u16(vget_low_u16(vrsum_shifted.val[0]), vget_high_u16(vrsum_shifted.val[0])),
775 vpadd_u16(vget_low_u16(vrsum_shifted.val[1]), vget_high_u16(vrsum_shifted.val[1])),
776 }
777 };
778 uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]);
779
780 // Scale lower result
781 scale_vector_s16x8<exclude_padding>(res_upper, id, 1, 2,
782 pool_size, upper_bound_w, upper_bound_h,
783 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
784 upper_res = vmovn_u16(res_upper);
785 }
786 }
787 else
788 {
789 const uint8x16_t max_data = vmaxq_u8(top_data, bottom_data);
790 lower_res = vpmax_u8(vget_low_u8(max_data), vget_high_u8(max_data));
791 if(pool_stride_x == 1)
792 {
793 const uint8x16_t max_data_shifted = vextq_u8(max_data, max_data, 1);
794 upper_res = vpmax_u8(vget_low_u8(max_data_shifted), vget_high_u8(max_data_shifted));
795 }
796 }
797
798 // Store result
799 if(pool_stride_x == 1)
800 {
801 const uint8x8x2_t res = { { lower_res, upper_res } };
802 vst2_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
803 }
804 else
805 {
806 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), lower_res);
807 }
808 },
809 input, output);
810}
811
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100812template <PoolingType pooling_type>
813void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
814{
815 Iterator input(_input, window_input);
816 Iterator output(_output, window);
817
818 const int fixed_point_position = _input->info()->fixed_point_position();
819 constexpr int pool_size = 2;
820 int pool_pad_x = 0;
821 int pool_pad_y = 0;
822 int pool_stride_x = 0;
823 int pool_stride_y = 0;
824 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
825 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
826 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
827 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
828
829 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
830 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
831
832 execute_window_loop(window, [&](const Coordinates & id)
833 {
834 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
835 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
836 qint16x4_t lower_res = {};
837 qint16x4_t upper_res = {};
838 if(pooling_type == PoolingType::AVG)
839 {
840 // Calculate scale
841 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
842 const qint16x4_t scale_vec = vdup_n_qs16(scale);
843
844 // Perform pooling
845 const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
846 lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
847 if(pool_stride_x == 1)
848 {
849 const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
850 upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
851 }
852 }
853 else
854 {
855 const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
856 lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
857 if(pool_stride_x == 1)
858 {
859 const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
860 upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
861 }
862 }
863 if(pool_stride_x == 1)
864 {
Georgios Pinitasdc460f12017-08-24 19:02:44 +0100865 const qint16x4x2_t res = { { lower_res, upper_res } };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100866 vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
867 }
868 else
869 {
870 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
871 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100872 },
873 input, output);
874}
875
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000876template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100877void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
878{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000879#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100880 Iterator input(_input, window_input);
881 Iterator output(_output, window);
882
883 constexpr const int pool_size = 3;
884 int pool_pad_x = 0;
885 int pool_pad_y = 0;
886 int pool_stride_x = 0;
887 int pool_stride_y = 0;
888 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
889 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000890 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
891 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100892
893 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
894 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
895 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
896
897 execute_window_loop(window, [&](const Coordinates & id)
898 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100899 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
900 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
901 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
902 float16x4_t res = {};
903
904 // Get power of 2 in case of l2 pooling
905 if(pooling_type == PoolingType::L2)
906 {
907 top_data = vmul_f16(top_data, top_data);
908 middle_data = vmul_f16(middle_data, middle_data);
909 bottom_data = vmul_f16(bottom_data, bottom_data);
910 }
911
912 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100913 {
914 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000915 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100916 const float16x4_t scale_v = vdup_n_f16(scale);
917 // Perform pooling
918 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
919 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
920 res = vmul_f16(vpadd_f16(res, res), scale_v);
921 }
922 else
923 {
924 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
925 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
926 res = vpmax_f16(res, res);
927 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100928
929 // Calculate square-root in case of l2 pooling
930 if(pooling_type == PoolingType::L2)
931 {
932 res = vinv_f16(vinvsqrt_f16(res));
933 }
934
Pablo Tello0c34fe22017-06-26 17:17:42 +0100935 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
936 },
937 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000938#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100939 ARM_COMPUTE_UNUSED(window_input);
940 ARM_COMPUTE_UNUSED(window);
941 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000942#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100943}
944
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000945template <PoolingType pooling_type, bool exclude_padding>
Pablo Tello0c34fe22017-06-26 17:17:42 +0100946void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
947{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000948#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100949 Iterator input(_input, window_input);
950 Iterator output(_output, window);
951 constexpr int pool_size = 2;
952 int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
953 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
954 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000955 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
956 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100957
958 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
959 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
960
961 execute_window_loop(window, [&](const Coordinates & id)
962 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100963 auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
964 auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100965 float16x8_t res = {};
966
Georgios Pinitascdf51452017-08-31 14:21:36 +0100967 // Get power of 2 in case of l2 pooling
968 if(pooling_type == PoolingType::L2)
969 {
970 top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
971 top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
972 bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
973 bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
974 }
975
976 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100977 {
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000978 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100979 const float16x8_t scale_v = vdupq_n_f16(scale);
980 res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
981 }
982 else
983 {
984 res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
985 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100986
987 // Calculate square-root in case of l2 pooling
988 if(pooling_type == PoolingType::L2)
989 {
990 res = vinvq_f16(vinvsqrtq_f16(res));
991 }
992
993 // Store result
Pablo Tello0c34fe22017-06-26 17:17:42 +0100994 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
995 },
996 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000997#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100998 ARM_COMPUTE_UNUSED(window_input);
999 ARM_COMPUTE_UNUSED(window);
1000 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001001#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001002}
1003
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001004template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001005void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
1006{
1007 Iterator input(_input, window_input);
1008 Iterator output(_output, window);
1009
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001010 constexpr int pool_size = 2;
1011 int pool_pad_x = 0;
1012 int pool_pad_y = 0;
1013 int pool_stride_x = 0;
1014 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001015 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1016 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001017 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1018 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001019
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001020 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1021 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001022
1023 execute_window_loop(window, [&](const Coordinates & id)
1024 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001025 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1026 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1027 float32x2_t res = {};
1028 float final_res = 0;
1029
1030 // Get power of 2 in case of l2 pooling
1031 if(pooling_type == PoolingType::L2)
1032 {
1033 top_data = vmul_f32(top_data, top_data);
1034 bottom_data = vmul_f32(bottom_data, bottom_data);
1035 }
1036
1037 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001038 {
1039 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001040 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001041 const float32x2_t scale_v = vdup_n_f32(scale);
1042
1043 // Perform pooling
1044 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1045 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1046 }
1047 else
1048 {
1049 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1050 res = vpmax_f32(max_data, max_data);
1051 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001052 final_res = vget_lane_f32(res, 0);
1053
1054 // Calculate square-root in case of l2 pooling
1055 if(pooling_type == PoolingType::L2)
1056 {
1057 final_res = sqrt(final_res);
1058 }
1059
1060 // Store result
1061 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001062 },
1063 input, output);
1064}
1065
1066template <PoolingType pooling_type>
1067void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
1068{
1069 Iterator input(_input, window_input);
1070 Iterator output(_output, window);
1071
1072 const int fixed_point_position = _input->info()->fixed_point_position();
1073 constexpr int pool_size = 3;
1074 int pool_pad_x = 0;
1075 int pool_pad_y = 0;
1076 int pool_stride_x = 0;
1077 int pool_stride_y = 0;
1078 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1079 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1080 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
1081 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
1082
1083 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1084 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
1085 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
1086
1087 execute_window_loop(window, [&](const Coordinates & id)
1088 {
1089 const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
1090 const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
1091 const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
1092 qint8x8_t res = {};
1093 if(pooling_type == PoolingType::AVG)
1094 {
1095 // Calculate scale
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001096 const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001097
1098 // Perform pooling for stride 2
1099 const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
1100 const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
1101 const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
1102 const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
1103 if(pool_stride_x == 2)
1104 {
1105 const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
1106 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001107 const qint8x8_t scale_vec = vdup_n_qs8(scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001108 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001109 res = vqmul_qs8(res, scale_vec, fixed_point_position);
1110 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001111 }
1112 else
1113 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001114 const qint8x16_t scale_vec = vdupq_n_qs8(scale);
1115 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001116 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001117 }
1118 else
1119 {
1120 const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
1121 const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
1122 const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
1123 const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
1124
1125 if(pool_stride_x == 2)
1126 {
1127 const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
1128 static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1129 res = vtbl2_s8(table, lookup_val);
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001130 vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001131 }
1132 else
1133 {
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001134 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001135 }
1136 }
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001137 },
1138 input, output);
1139}
1140
Georgios Pinitas55186712018-01-08 17:37:12 +00001141template <PoolingType pooling_type, bool exclude_padding>
1142void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window)
1143{
1144 Iterator input(_input, window_input);
1145 Iterator output(_output, window);
1146
1147 constexpr int pool_size = 3;
1148 int pool_pad_x = 0;
1149 int pool_pad_y = 0;
1150 int pool_stride_x = 0;
1151 int pool_stride_y = 0;
1152 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1153 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1154 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1155 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
1156
1157 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1158 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
1159 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
1160
1161 execute_window_loop(window, [&](const Coordinates & id)
1162 {
1163 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
1164 const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
1165 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
1166
1167 if(pooling_type == PoolingType::AVG)
1168 {
1169 // Convert data to u16
1170 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
1171 const uint16x8x2_t middle_data_u16 = { { vmovl_u8(vget_low_u8(middle_data)), vmovl_u8(vget_high_u8(middle_data)) } };
1172 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
1173
1174 // Calculate row sums
1175 const uint16x8x2_t vrsum =
1176 {
1177 {
1178 vaddq_u16(vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]), middle_data_u16.val[0]),
1179 vaddq_u16(vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]), middle_data_u16.val[1]),
1180 }
1181 };
1182 const uint16x8x2_t vrsum_shifted_1 =
1183 {
1184 {
1185 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
1186 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
1187 }
1188 };
1189 const uint16x8x2_t vrsum_shifted_2 =
1190 {
1191 {
1192 vextq_u16(vrsum.val[0], vrsum.val[1], 2),
1193 vextq_u16(vrsum.val[1], vrsum.val[1], 2)
1194 }
1195 };
1196 // Calculate final sum
1197 uint16x8x2_t final_sum =
1198 {
1199 {
1200 vaddq_u16(vaddq_u16(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1201 vaddq_u16(vaddq_u16(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
1202 }
1203 };
1204 if(pool_stride_x == 2)
1205 {
1206 uint16x8_t res =
1207 {
1208 vgetq_lane_u16(final_sum.val[0], 0),
1209 vgetq_lane_u16(final_sum.val[0], 2),
1210 vgetq_lane_u16(final_sum.val[0], 4),
1211 vgetq_lane_u16(final_sum.val[0], 6),
1212 vgetq_lane_u16(final_sum.val[1], 0),
1213 vgetq_lane_u16(final_sum.val[1], 2),
1214 vgetq_lane_u16(final_sum.val[1], 4),
1215 vgetq_lane_u16(final_sum.val[1], 6),
1216 };
1217
1218 scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
1219 pool_size, upper_bound_w, upper_bound_h,
1220 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1221 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
1222 }
1223 else
1224 {
1225 // Scale lower result
1226 scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
1227 pool_size, upper_bound_w, upper_bound_h,
1228 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1229 // Scale lower result
1230 scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
1231 pool_size, upper_bound_w, upper_bound_h,
1232 pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1233 const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
1234 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
1235 }
1236 }
1237 else
1238 {
1239 const uint8x16_t max_data = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data);
1240 const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1);
1241 const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2);
1242 const uint8x16_t final_max = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2);
1243
1244 if(pool_stride_x == 2)
1245 {
1246 const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
1247 static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1248 const uint8x8_t res = vtbl2_u8(table, lookup_val);
1249 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
1250 }
1251 else
1252 {
1253 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
1254 }
1255 }
1256 },
1257 input, output);
1258}
1259
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001260template <PoolingType pooling_type>
1261void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
1262{
1263 Iterator input(_input, window_input);
1264 Iterator output(_output, window);
1265
1266 const int fixed_point_position = _input->info()->fixed_point_position();
1267 constexpr int pool_size = 3;
1268 int pool_pad_x = 0;
1269 int pool_pad_y = 0;
1270 int pool_stride_x = 0;
1271 int pool_stride_y = 0;
1272 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1273 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1274 const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
1275 const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
1276
1277 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1278 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
1279 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
1280
1281 execute_window_loop(window, [&](const Coordinates & id)
1282 {
1283 const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
1284 const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
1285 const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
1286
1287 if(pooling_type == PoolingType::AVG)
1288 {
1289 // Calculate scale
1290 const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
1291
1292 // Perform pooling for stride 2
1293 const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
1294 const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
1295 const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
1296 const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
1297 if(pool_stride_x == 2)
1298 {
1299 const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
1300 const qint16x4_t scale_vec = vdup_n_qs16(scale);
1301 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
1302 }
1303 else
1304 {
1305 const qint16x8_t scale_vec = vdupq_n_qs16(scale);
1306 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
1307 }
1308 }
1309 else
1310 {
1311 const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
1312 const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
1313 const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
1314 const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
1315
1316 if(pool_stride_x == 2)
1317 {
1318 const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
1319 vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
1320 }
1321 else
1322 {
1323 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
1324 }
1325 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001326 },
1327 input, output);
1328}
1329
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001330template <PoolingType pooling_type, bool exclude_padding>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001331void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
1332{
1333 Iterator input(_input, window_input);
1334 Iterator output(_output, window);
1335
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001336 constexpr const int pool_size = 3;
1337 int pool_pad_x = 0;
1338 int pool_pad_y = 0;
1339 int pool_stride_x = 0;
1340 int pool_stride_y = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001341 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1342 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001343 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1344 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001345
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001346 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
1347 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
1348 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001349
1350 execute_window_loop(window, [&](const Coordinates & id)
1351 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001352 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1353 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1354 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1355 float32x2_t res = {};
1356 float final_res = 0;
1357
1358 // Get power of 2 in case of l2 pooling
1359 if(pooling_type == PoolingType::L2)
1360 {
1361 top_data = vmulq_f32(top_data, top_data);
1362 middle_data = vmulq_f32(middle_data, middle_data);
1363 bottom_data = vmulq_f32(bottom_data, bottom_data);
1364 }
1365
1366 if(pooling_type != PoolingType::MAX)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001367 {
1368 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001369 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001370 const float32x2_t scale_v = vdup_n_f32(scale);
1371
1372 // Perform pooling
1373 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1374 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1375 res = vmul_f32(vpadd_f32(res, res), scale_v);
1376 }
1377 else
1378 {
1379 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1380 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1381 res = vpmax_f32(res, res);
1382 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001383 final_res = vget_lane_f32(res, 0);
1384
1385 // Calculate square-root in case of l2 pooling
1386 if(pooling_type == PoolingType::L2)
1387 {
1388 final_res = sqrt(final_res);
1389 }
1390
1391 // Store result
1392 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001393 },
1394 input, output);
1395}
1396
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001397template <PoolingType pooling_type, bool exclude_padding>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001398void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
1399{
1400 Iterator input(_input, window_input);
1401 Iterator output(_output, window);
1402
1403 constexpr const int pool_size = 7;
1404 int pool_pad_x = 0;
1405 int pool_pad_y = 0;
1406 int pool_stride_x = 0;
1407 int pool_stride_y = 0;
1408 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1409 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001410 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1411 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001412
1413 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1414 for(int i = 0; i < pool_size; ++i)
1415 {
1416 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
1417 }
1418
1419 execute_window_loop(window, [&](const Coordinates & id)
1420 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001421 float32x2_t res = {};
1422 float final_res = 0.f;
1423 if(pooling_type != PoolingType::MAX)
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001424 {
1425 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001426 float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001427 const float32x2_t scale_v = vdup_n_f32(scale);
1428
1429 // Perform pooling
Georgios Pinitascdf51452017-08-31 14:21:36 +01001430 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1431 // Get power of 2 in case of l2 pooling
1432 if(pooling_type == PoolingType::L2)
1433 {
1434 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1435 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1436 }
1437 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001438 for(int i = 1; i < pool_size; ++i)
1439 {
Georgios Pinitascdf51452017-08-31 14:21:36 +01001440 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1441 // Get power of 2 in case of l2 pooling
1442 if(pooling_type == PoolingType::L2)
1443 {
1444 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1445 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1446 }
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001447 sum_data = vaddq_f32(sum_data, data.val[0]);
1448 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1449 }
1450 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1451 res = vmul_f32(vpadd_f32(res, res), scale_v);
1452 }
1453 else
1454 {
1455 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1456 for(int i = 1; i < pool_size; ++i)
1457 {
1458 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1459 max_data = vmax2q_f32(max_data, data);
1460 }
1461 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1462 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1463 res = vpmax_f32(res, res);
1464 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001465 final_res = vget_lane_f32(res, 0);
1466
1467 // Calculate square-root in case of l2 pooling
1468 if(pooling_type == PoolingType::L2)
1469 {
1470 final_res = sqrt(final_res);
1471 }
1472
1473 // Store result
1474 *(reinterpret_cast<float *>(output.ptr())) = final_res;
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001475 },
1476 input, output);
1477}
1478
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001479template <PoolingType pooling_type, bool exclude_padding>
Gian Marco Iodice16824302017-09-28 15:41:37 +01001480void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
1481{
1482 Iterator input(_input, window_input);
1483 Iterator output(_output, window);
1484
Georgios Pinitas4c2dd542017-11-13 12:58:41 +00001485 const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001486 int pool_pad_x = 0;
1487 int pool_pad_y = 0;
1488 int pool_stride_x = 0;
1489 int pool_stride_y = 0;
1490 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1491 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001492 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1493 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001494
1495 execute_window_loop(window, [&](const Coordinates & id)
1496 {
1497 float res = 0.0f;
1498
1499 if(pooling_type != PoolingType::MAX)
1500 {
1501 // Calculate scale
Georgios Pinitasadaae7e2017-10-30 15:56:32 +00001502 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001503
1504 // Perform pooling
1505 float32x4_t vres = vdupq_n_f32(0.0f);
1506
1507 for(int y = 0; y < pool_size; ++y)
1508 {
1509 int x = 0;
1510 for(; x <= (pool_size - 4); x += 4)
1511 {
1512 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
1513 (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1514
1515 // Get power of 2 in case of l2 pooling and accumulate
1516 if(pooling_type == PoolingType::L2)
1517 {
1518 vres = vmlaq_f32(vres, data, data);
1519 }
1520 else
1521 {
1522 vres = vaddq_f32(vres, data);
1523 }
1524 }
1525
1526 // Leftover for loop
1527 for(; x < pool_size; ++x)
1528 {
1529 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1530
1531 // Get power of 2 in case of l2 pooling
1532 if(pooling_type == PoolingType::L2)
1533 {
1534 data *= data;
1535 }
1536
1537 res += data;
1538 }
1539 }
1540
1541#if defined(__aarch64__)
1542 // Reduction operation available on 64 bit architectures only
1543 res += vaddvq_f32(vres);
1544#else // __aarch64__
1545 // Reduction
1546 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1547 tmp = vpadd_f32(tmp, tmp);
1548
1549 res += vget_lane_f32(tmp, 0);
1550#endif // __aarch64__
1551 // Divide by scale
1552 res *= scale;
1553 }
1554 else
1555 {
1556 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::min());
1557 res = std::numeric_limits<float>::min();
1558
1559 for(int y = 0; y < pool_size; ++y)
1560 {
1561 int x = 0;
1562 for(; x <= (pool_size - 4); x += 4)
1563 {
1564 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
1565 (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1566 vres = vmaxq_f32(vres, data);
1567 }
1568
1569 // Leftover for loop
1570 for(; x < pool_size; ++x)
1571 {
1572 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1573 res = std::max(res, data);
1574 }
1575 }
1576
1577#if defined(__aarch64__)
1578 // Reduction operation available on 64 bit architectures only
1579 res = std::max(vmaxvq_f32(vres), res);
1580#else // __aarch64__
1581 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1582 tmp = vpmax_f32(tmp, tmp);
1583
1584 res = std::max(res, vget_lane_f32(tmp, 0));
1585#endif // __aarch64__
1586 }
1587
1588 // Calculate square-root in case of l2 pooling
1589 if(pooling_type == PoolingType::L2)
1590 {
1591 res = std::sqrt(res);
1592 }
1593
1594 // Store result
1595 *(reinterpret_cast<float *>(output.ptr())) = res;
1596 },
1597 input, output);
1598}
1599
Georgios Pinitas55186712018-01-08 17:37:12 +00001600template <PoolingType pooling_type, bool exclude_padding>
1601void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Window &window)
1602{
1603 Iterator input(_input, window_input);
1604 Iterator output(_output, window);
1605
1606 const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
1607 int pool_pad_x = 0;
1608 int pool_pad_y = 0;
1609 int pool_stride_x = 0;
1610 int pool_stride_y = 0;
1611 std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
1612 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1613 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
1614 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
1615
1616 execute_window_loop(window, [&](const Coordinates & id)
1617 {
1618 uint8_t res = 0;
1619
1620 if(pooling_type != PoolingType::MAX)
1621 {
1622 uint32x4_t vres = vdupq_n_u32(0);
1623 uint32_t sres = 0;
1624
1625 // Calculate scale
1626 const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
1627
1628 // Perform pooling
1629 for(int y = 0; y < pool_size; ++y)
1630 {
1631 int x = 0;
1632 for(; x <= (pool_size - 8); x += 8)
1633 {
1634 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1635
1636 const uint16x8_t data_u16 = vmovl_u8(data);
1637 vres = vaddq_u32(vres, vaddl_u16(vget_high_u16(data_u16), vget_low_u16(data_u16)));
1638 }
1639
1640 // Leftover for loop
1641 for(; x < pool_size; ++x)
1642 {
1643 uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1644 sres += data;
1645 }
1646 }
1647
1648 // Reduction
1649 const auto tmp = vpadd_u32(vget_high_u32(vres), vget_low_u32(vres));
1650 sres += vget_lane_u32(tmp, 0) + vget_lane_u32(tmp, 1);
1651
1652 // Divide by scale
1653 res = static_cast<uint8_t>(support::cpp11::round(sres * scale));
1654 }
1655 else
1656 {
1657 uint8x8_t vres = vdup_n_u8(0);
1658 res = 0;
1659
1660 for(int y = 0; y < pool_size; ++y)
1661 {
1662 int x = 0;
1663 for(; x <= (pool_size - 8); x += 8)
1664 {
1665 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1666 vres = vmax_u8(vres, data);
1667 }
1668
1669 // Leftover for loop
1670 for(; x < pool_size; ++x)
1671 {
1672 const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
1673 res = std::max(res, data);
1674 }
1675 }
1676
1677 // Reduce max
1678 vres = vpmax_u8(vres, vres);
1679 vres = vpmax_u8(vres, vres);
1680 vres = vpmax_u8(vres, vres);
1681
1682 // Get max value
1683 res = std::max(res, vget_lane_u8(vres, 0));
1684 }
1685
1686 // Store result
1687 *(reinterpret_cast<uint8_t *>(output.ptr())) = res;
1688 },
1689 input, output);
1690}
1691
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001692Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
1693{
1694 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
1695
1696 unsigned int pooled_w = 0;
1697 unsigned int pooled_h = 0;
1698 unsigned int num_elems_processed_per_iteration = 0;
1699 BorderSize border_size(0);
1700
1701 const bool is_global_pooling = pool_info.is_global_pooling();
1702 const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
1703
1704 // Validate pool info befor calling scaled_dimensions
1705 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size));
1706
1707 // Check output dimensions
1708 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
1709 input->dimension(1),
1710 pool_size,
1711 pool_size,
1712 pool_info.pad_stride_info());
1713
1714 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size));
1715 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first);
1716
1717 return Status{};
1718}
1719
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001720void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001721{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001722 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001723 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1724 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1725 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1726
Pablo Tello0c34fe22017-06-26 17:17:42 +01001727 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
1728 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
Georgios Pinitas55186712018-01-08 17:37:12 +00001729 const unsigned int pool_size = _pool_info.pool_size();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001730
1731 // Set step for input in x and y direction for the input
1732 Window window_input(window);
1733 unsigned int window_x_inc = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +01001734 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001735 {
Pablo Tello0c34fe22017-06-26 17:17:42 +01001736 case DataType::QS8:
Michalis Spyroubbd9fb92017-06-22 12:57:51 +01001737 case DataType::QS16:
Pablo Tello0c34fe22017-06-26 17:17:42 +01001738 case DataType::F16:
1739 {
1740 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1741 break;
1742 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001743 case DataType::QASYMM8:
1744 {
1745 window_x_inc = pool_stride_x;
1746 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
1747 {
1748 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1749 }
1750 break;
1751 }
Pablo Tello0c34fe22017-06-26 17:17:42 +01001752 case DataType::F32:
1753 {
1754 window_x_inc = pool_stride_x;
1755 break;
1756 }
1757 default:
1758 {
1759 ARM_COMPUTE_ERROR("Not supported");
1760 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001761 }
1762 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
1763 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
1764
1765 // Run function
1766 (this->*_func)(window_input, window);
1767}