blob: fdbba815b4ee1dfa208a7d85ce0e866f8752887d [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002 * Copyright (c) 2017-2020 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010038#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010039
Georgios Pinitas55186712018-01-08 17:37:12 +000040#include "support/ToolchainSupport.h"
41
Manuel Bottinib4bb8272019-12-18 18:01:27 +000042#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043#include <algorithm>
44#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010045#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010047#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010048#include <string>
49#include <tuple>
50
Manuel Bottinib4bb8272019-12-18 18:01:27 +000051namespace arm_compute
52{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010053using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010054
55namespace
56{
Pablo Tello77e6c552018-12-04 15:33:49 +000057inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010058 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
59{
Michalis Spyrou57dac842018-03-01 16:03:50 +000060 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
61 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
62
63 int start_x = id[idx_width] * stride_x - pad_x;
64 int start_y = id[idx_height] * stride_y - pad_y;
65
66 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
67 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000068 if(exclude_padding)
69 {
70 start_x = std::max(0, start_x);
71 start_y = std::max(0, start_y);
72 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073 return 1.f / ((end_y - start_y) * (end_x - start_x));
74}
75
Manuel Bottinib4bb8272019-12-18 18:01:27 +000076template <typename T, typename TVec>
77inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
Georgios Pinitas55186712018-01-08 17:37:12 +000078 const int pool_size, const int upper_bound_w, const int upper_bound_h,
79 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
80{
81 int start_x = (id.x() + id_offset) * stride_x - pad_x;
82 int start_y = id.y() * stride_y - pad_y;
83 const int end_y = std::min(start_y + pool_size, upper_bound_h);
84 if(exclude_padding)
85 {
86 start_y = std::max(0, start_y);
87 }
88
Manuel Bottinib4bb8272019-12-18 18:01:27 +000089 std::array<T, 8> elems =
Georgios Pinitas55186712018-01-08 17:37:12 +000090 {
91 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +000092 wrapper::vgetlane(v, 0),
93 wrapper::vgetlane(v, 1),
94 wrapper::vgetlane(v, 2),
95 wrapper::vgetlane(v, 3),
96 wrapper::vgetlane(v, 4),
97 wrapper::vgetlane(v, 5),
98 wrapper::vgetlane(v, 6),
99 wrapper::vgetlane(v, 7),
Georgios Pinitas55186712018-01-08 17:37:12 +0000100 }
101 };
102
103 for(auto &el : elems)
104 {
105 int c_start_x = start_x;
106 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
107 if(exclude_padding)
108 {
109 c_start_x = std::max(0, c_start_x);
110 }
111 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
112 el *= scale;
113 start_x += step * stride_x;
114 }
115
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000116 v = wrapper::vsetlane(elems[0], v, 0);
117 v = wrapper::vsetlane(elems[1], v, 1);
118 v = wrapper::vsetlane(elems[2], v, 2);
119 v = wrapper::vsetlane(elems[3], v, 3);
120 v = wrapper::vsetlane(elems[4], v, 4);
121 v = wrapper::vsetlane(elems[5], v, 5);
122 v = wrapper::vsetlane(elems[6], v, 6);
123 v = wrapper::vsetlane(elems[7], v, 7);
Georgios Pinitas55186712018-01-08 17:37:12 +0000124}
125
morgolockcc1f6c92020-03-24 09:26:48 +0000126Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
127 unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100128{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000129 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100130
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000131 int pool_stride_x = 0;
132 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000133 PoolingType pool_type = pool_info.pool_type;
134 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100135 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100136
Anthony Barbiereaefd002018-07-20 17:49:35 +0100137 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
morgolockcc1f6c92020-03-24 09:26:48 +0000138 if(indices)
139 {
140 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
141 ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
142 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000143 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000144 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Michele Di Giorgio2c877192020-02-18 19:06:27 +0000145 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
146 && (input->data_layout() == DataLayout::NHWC),
147 "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000148
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000149 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100150 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000151 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000152 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
153 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
154 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
morgolockcc1f6c92020-03-24 09:26:48 +0000155
156 if(indices)
157 {
158 ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
159 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "Pool indices only supported in NCHW");
160 ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
161 || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
162 }
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100163 }
164
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000165 return Status{};
166}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100167
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000168Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000169{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000170 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
171 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000172
173 return Status{};
174}
175
morgolockcc1f6c92020-03-24 09:26:48 +0000176std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
177 unsigned int &num_elems_processed_per_iteration,
178 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000179 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000180{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100181 // Output auto inizialitation if not yet initialized
182 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
morgolockcc1f6c92020-03-24 09:26:48 +0000183 if(indices)
184 {
185 // Indices auto inizialitation if not yet initialized
186 auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info))).set_data_type(DataType::U32) /* we store the offset to the element */);
187 }
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000188 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000189 unsigned int num_elems_read_per_iteration = 0;
190 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000191 int pool_stride_x = 0;
192 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000193 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
194 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
195 const int input_width = input->dimension(idx_width);
196 const int input_height = input->dimension(idx_height);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000197 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000198 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000199 const int pool_pad_right = pad_stride_info.pad_right();
200 const int pool_pad_top = pad_stride_info.pad_top();
201 const int pool_pad_left = pad_stride_info.pad_left();
202 const int pool_pad_bottom = pad_stride_info.pad_bottom();
203 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000204
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000205 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +0000206 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
207 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000208 pool_size_x,
209 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000210 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100211
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000212 //If it's not squared and optimized will be executed the MxN
213 num_elems_read_per_iteration = 1;
214 num_elems_processed_per_iteration = 1;
215 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100216
Michalis Spyrou57dac842018-03-01 16:03:50 +0000217 const bool is_nhwc = data_layout == DataLayout::NHWC;
218
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000219 if(is_square)
220 {
221 switch(input->data_type())
222 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000223 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000224 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000225 if(is_nhwc)
226 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100227 num_elems_processed_per_iteration = 16;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000228 break;
229 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000230 switch(pool_size_x)
231 {
232 case 2:
233 num_elems_read_per_iteration = 16;
234 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
235 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
236 break;
237 case 3:
238 num_elems_read_per_iteration = 16;
239 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
240 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
241 break;
242 default:
243 break;
244 }
245 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000246#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
247 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000248 if(is_nhwc)
249 {
250 num_elems_processed_per_iteration = 8;
251 break;
252 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000253 switch(pool_size_x)
254 {
255 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000256 case 3:
257 num_elems_read_per_iteration = 4;
258 num_elems_processed_per_iteration = 1;
259 num_elems_horizontal_window = 1;
260 break;
261 default:
262 break;
263 }
264 break;
265#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
266 case DataType::F32:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000267 if(is_nhwc)
268 {
Georgios Pinitas64f1a902018-09-18 13:42:51 +0100269 num_elems_processed_per_iteration = 4;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000270 break;
271 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000272 switch(pool_size_x)
273 {
274 case 2:
275 num_elems_read_per_iteration = 2;
276 break;
277 case 3:
278 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
279 break;
280 case 7:
281 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
282 break;
283 default:
284 break;
285 }
286 num_elems_processed_per_iteration = 1;
287 num_elems_horizontal_window = 1;
288 break;
289 default:
290 ARM_COMPUTE_ERROR("Element size not supported");
291 break;
292 }
293 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000294 else
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000295 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000296 if(is_nhwc)
297 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100298 num_elems_processed_per_iteration = 16 / input->element_size();
Michalis Spyrou57dac842018-03-01 16:03:50 +0000299 }
300 }
301
302 bool window_changed = false;
303 Window win{};
304 if(data_layout == DataLayout::NCHW)
305 {
306 // Number of iterations in X dimension
307 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000308 // Upper limit for the number of right/bottom border elements that are accessed
309 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
310 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
morgolockcc1f6c92020-03-24 09:26:48 +0000311 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
312 border_size.right = std::max(upper_bound_w, pool_pad_right);
313 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000314 TensorShape output_shape{ input->tensor_shape() };
315 output_shape.set(0, pooled_w);
316 output_shape.set(1, pooled_h);
317 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000318 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
morgolockcc1f6c92020-03-24 09:26:48 +0000319 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000320 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
morgolockcc1f6c92020-03-24 09:26:48 +0000321 if(indices)
322 {
323 AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
324 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
325 }
326 else
327 {
328 window_changed = update_window_and_padding(win, input_access, output_access);
329 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000330 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
331 }
332 else
333 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000334 TensorShape output_shape{ input->tensor_shape() };
335 output_shape.set(1, pooled_w);
336 output_shape.set(2, pooled_h);
337 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000338 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
339 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000340 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
morgolockcc1f6c92020-03-24 09:26:48 +0000341 if(indices)
342 {
343 AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
344 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
345 }
346 else
347 {
348 window_changed = update_window_and_padding(win, input_access, output_access);
349 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000350 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000351 }
352
353 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
354 return std::make_pair(err, win);
355}
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000356
357template <typename T>
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000358inline T vcvtq_q32_f32(float32x4_t values);
359
360template <>
361inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
362{
363 return vcvtq_u32_f32(values);
364}
365
366template <>
367inline int32x4_t vcvtq_q32_f32(float32x4_t values)
368{
369 return vcvtq_s32_f32(values);
370}
371
372template <typename T>
373inline float32x4_t vcvtq_f32_q32(T values);
374
375template <>
376inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
377{
378 return vcvtq_f32_u32(values);
379}
380
381template <>
382inline float32x4_t vcvtq_f32_q32(int32x4_t values)
383{
384 return vcvtq_f32_s32(values);
385}
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000386
387template <typename Tout>
388inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
389
390template <>
391inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
392{
393 const float new_scale = quant_rescale / scale_pooling;
394 return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
395}
396
397template <>
398inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
399{
400 const float new_scale = quant_rescale / scale_pooling;
401 return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
402}
403
404template <typename Tin, typename Tout>
405inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
406
407template <>
408inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
409{
410 const float32x4x4_t acc =
411 {
412 {
413 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
414 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
415 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
416 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
417 }
418 };
419 return vquantize(acc, requant_qinfo);
420}
421
422template <>
423inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
424{
425 const float32x4x4_t acc =
426 {
427 {
428 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
429 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
430 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
431 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
432 }
433 };
434 return vquantize_signed(acc, requant_qinfo);
435}
436
437template <typename T>
438inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
439
440template <>
441inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
442{
443 const float32x4x2_t acc =
444 {
445 {
446 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
447 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
448 }
449 };
450 return vquantize(acc, requant_qinfo);
451}
452
453template <>
454inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
455{
456 const float32x4x2_t acc =
457 {
458 {
459 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
460 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
461 }
462 };
463 return vquantize_signed(acc, requant_qinfo);
464}
465
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000466} // namespace
467
468NEPoolingLayerKernel::NEPoolingLayerKernel()
morgolockcc1f6c92020-03-24 09:26:48 +0000469 : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000470{
471}
472
473BorderSize NEPoolingLayerKernel::border_size() const
474{
475 return _border_size;
476}
477
morgolockcc1f6c92020-03-24 09:26:48 +0000478void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000479{
480 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000481 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
482 const bool is_global_pooling = pool_info.is_global_pooling;
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000483 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000484
485 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000486 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
487 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
488 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000489
490 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000491 const Size2D pool_size(
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000492 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
493 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000494
495 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000496 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000497
498 // Check output dimensions
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100499 unsigned int pooled_w;
500 unsigned int pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000501 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
502 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000503 pool_size.x(),
504 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000505 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000506
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000507 // Perform validation step
morgolockcc1f6c92020-03-24 09:26:48 +0000508 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100509
510 // Set instance variables
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000511 _input = input;
512 _output = output;
morgolockcc1f6c92020-03-24 09:26:48 +0000513 _indices = indices;
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000514 _pool_info = pool_info;
515 _data_layout = input->info()->data_layout();
516 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100517
Georgios Pinitas55186712018-01-08 17:37:12 +0000518 // Get data type
519 const DataType data_type = input->info()->data_type();
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000520 const bool is_nchw = _data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000521
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100522 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000523 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000524 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000525 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000526 if(is_nchw)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100527 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000528 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000529 }
530 else
531 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000532 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000533 }
534 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000535 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000536 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000537 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000538 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000539 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000540 }
541 else
542 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000543 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000544 }
545 }
546 else
547 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000548 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000549 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000550 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000551 }
552 else
553 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000554 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
555 }
556 }
557 }
558 else if(data_type == DataType::QASYMM8_SIGNED)
559 {
560 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
561 {
562 if(is_nchw)
563 {
564 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
565 }
566 else
567 {
568 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
569 }
570 }
571 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
572 {
573 if(is_nchw)
574 {
575 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
576 }
577 else
578 {
579 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
580 }
581 }
582 else
583 {
584 if(is_nchw)
585 {
586 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
587 }
588 else
589 {
590 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000591 }
592 }
593 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000594 else if(data_type == DataType::F16)
595 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000596 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000597 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000598 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000599 {
600 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000601 {
602 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000603 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000604 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000605 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000606 else
607 {
608 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
609 }
610 }
611 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000612 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000613 {
614 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000615 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000616 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000617 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000618 else
619 {
620 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
621 }
622 }
623 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000624 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000625 {
626 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000627 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000628 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
629 }
630 else
631 {
632 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000633 }
634 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000635 }
636 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000637 }
638 }
639 else
640 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000641 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000642 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000643 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
644 }
645 else
646 {
647 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000648 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000649 }
650 }
651 else if(data_type == DataType::F32)
652 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000653 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000654 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000655 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000656 {
657 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000658 {
659 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000660 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000661 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
662 }
663 else
664 {
665 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000666 }
667 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000668 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000669 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000670 {
671 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000672 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000673 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
674 }
675 else
676 {
677 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000678 }
679 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000680 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000681 case 7:
Pablo Tello77e6c552018-12-04 15:33:49 +0000682 {
683 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000684 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000685 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
686 }
687 else
688 {
689 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000690 }
691 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000692 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000693 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000694 {
695 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000696 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000697 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
698 }
699 else
700 {
701 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000702 }
703 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000704 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000705 }
706 }
707 else
708 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000709 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000710 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000711 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
712 }
713 else
714 {
715 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000716 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000717 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100718 }
719
720 // Configure kernel window
morgolockcc1f6c92020-03-24 09:26:48 +0000721 auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
722 pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000723 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
724 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100725}
726
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000727template <typename T>
728void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000729{
730 Iterator input(_input, window_input);
731 Iterator output(_output, window);
732
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000733 /** NEON vector types */
734 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
735 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
736 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
737 using q16_t = typename wrapper::traits::promote_t<T>;
738 using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
739 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
740 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
741
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000742 constexpr int pool_size = 2;
743 int pool_stride_x = 0;
744 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000745 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
746 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
747 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
748 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
749 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000750 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
751 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000752
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000753 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
754 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000755
756 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
757
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100758 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
759 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
760 const bool have_different_qinfo = input_qinfo != output_qinfo;
761
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000762 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
763 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
764 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
765
Georgios Pinitas55186712018-01-08 17:37:12 +0000766 execute_window_loop(window, [&](const Coordinates & id)
767 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000768 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
769 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
770 q8x8_t lower_res = {};
771 q8x8_t upper_res = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000772
773 if(pooling_type != PoolingType::MAX)
774 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000775 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
776 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000777
778 // Add rows
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000779 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000780 {
781 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000782 wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
783 wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000784 }
785 };
786
787 // Pair-wise add row data
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000788 const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
789 const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +0000790
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000791 q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
Georgios Pinitas55186712018-01-08 17:37:12 +0000792
793 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000794 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
795 pool_size, upper_bound_w, upper_bound_h,
796 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
797 lower_res = wrapper::vmovn(res_lower);
Georgios Pinitas55186712018-01-08 17:37:12 +0000798
799 // Compute upper result for stride_x == 1
800 if(pool_stride_x == 1)
801 {
802 // Shifted row sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000803 const q16x8x2_t vrsum_shifted =
Georgios Pinitas55186712018-01-08 17:37:12 +0000804 {
805 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000806 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
807 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000808 }
809 };
810
811 // Pair-wise add shifted row
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000812 q16x8_t res_upper = wrapper::vcombine(
813 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
814 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
Georgios Pinitas55186712018-01-08 17:37:12 +0000815
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000816 // Scale upper result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000817 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
818 pool_size, upper_bound_w, upper_bound_h,
819 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
820 upper_res = wrapper::vmovn(res_upper);
Georgios Pinitas55186712018-01-08 17:37:12 +0000821 }
822 }
823 else
824 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000825 const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
826 lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
Georgios Pinitas55186712018-01-08 17:37:12 +0000827 if(pool_stride_x == 1)
828 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000829 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
830 upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
Georgios Pinitas55186712018-01-08 17:37:12 +0000831 }
832 }
833
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100834 if(have_different_qinfo)
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100835 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000836 const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000837 lower_res = wrapper::vgetlow(requantized_output);
838 upper_res = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100839 }
840
Georgios Pinitas55186712018-01-08 17:37:12 +0000841 // Store result
842 if(pool_stride_x == 1)
843 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000844 const q8x8x2_t res = { { lower_res, upper_res } };
845 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000846 }
847 else
848 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000849 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000850 }
851 },
852 input, output);
853}
854
Pablo Tello77e6c552018-12-04 15:33:49 +0000855void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100856{
Pablo Tello77e6c552018-12-04 15:33:49 +0000857 ARM_COMPUTE_UNUSED(pooling_type);
858 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000859#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100860 Iterator input(_input, window_input);
861 Iterator output(_output, window);
862
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000863 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000864 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
865 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
866 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
867 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000868 int pool_stride_x = 0;
869 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000870 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000871 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
872 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100873
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000874 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
875 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
876 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100877
878 execute_window_loop(window, [&](const Coordinates & id)
879 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100880 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
881 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
882 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
883 float16x4_t res = {};
884
885 // Get power of 2 in case of l2 pooling
886 if(pooling_type == PoolingType::L2)
887 {
888 top_data = vmul_f16(top_data, top_data);
889 middle_data = vmul_f16(middle_data, middle_data);
890 bottom_data = vmul_f16(bottom_data, bottom_data);
891 }
892
893 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100894 {
895 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000896 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100897 const float16x4_t scale_v = vdup_n_f16(scale);
898 // Perform pooling
899 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
900 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
901 res = vmul_f16(vpadd_f16(res, res), scale_v);
902 }
903 else
904 {
905 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
906 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
907 res = vpmax_f16(res, res);
908 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100909
910 // Calculate square-root in case of l2 pooling
911 if(pooling_type == PoolingType::L2)
912 {
913 res = vinv_f16(vinvsqrt_f16(res));
914 }
915
Pablo Tello0c34fe22017-06-26 17:17:42 +0100916 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
917 },
918 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000919#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100920 ARM_COMPUTE_UNUSED(window_input);
921 ARM_COMPUTE_UNUSED(window);
922 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000923#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100924}
925
Pablo Tello77e6c552018-12-04 15:33:49 +0000926void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100927{
Pablo Tello77e6c552018-12-04 15:33:49 +0000928 ARM_COMPUTE_UNUSED(pooling_type);
929 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000930#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100931 Iterator input(_input, window_input);
932 Iterator output(_output, window);
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000933 constexpr int pool_size = 2;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000934 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
935 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
936 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
937 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000938 int pool_stride_x, pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000939 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000940 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
941 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100942
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000943 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
944 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100945
946 execute_window_loop(window, [&](const Coordinates & id)
947 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100948 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
949 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
950 float16x4_t res = {};
Pablo Tello0c34fe22017-06-26 17:17:42 +0100951
Georgios Pinitascdf51452017-08-31 14:21:36 +0100952 // Get power of 2 in case of l2 pooling
953 if(pooling_type == PoolingType::L2)
954 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100955 top_data = vmul_f16(top_data, top_data);
956 bottom_data = vmul_f16(bottom_data, bottom_data);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100957 }
958
959 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100960 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000961 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100962 const float16x4_t scale_v = vdup_n_f16(scale);
963
964 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
965 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100966 }
967 else
968 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100969 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
970 res = vpmax_f16(max_data, max_data);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100971 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100972
973 // Calculate square-root in case of l2 pooling
974 if(pooling_type == PoolingType::L2)
975 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100976 res = vinv_f16(vinvsqrt_f16(res));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100977 }
978
979 // Store result
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100980 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100981 },
982 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000983#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100984 ARM_COMPUTE_UNUSED(window_input);
985 ARM_COMPUTE_UNUSED(window);
986 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000987#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100988}
989
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000990template <typename T>
991void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000992{
993 Iterator input(_input, window_input);
994 Iterator output(_output, window);
995
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000996 /** NEON vector types */
997 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
998 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
999 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
1000 using q16_t = typename wrapper::traits::promote_t<T>;
1001 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1002 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
1003
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001004 constexpr int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001005 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1006 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1007 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1008 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001009 int pool_stride_x = 0;
1010 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001011 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001012 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1013 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001014
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001015 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1016 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001017
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001018 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
1019 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1020 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
1021
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001022 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
1023 const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
1024 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001025
1026 execute_window_loop(window, [&](const Coordinates & id)
1027 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001028 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
1029 const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
1030 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
1031 q8x8_t fres = {};
1032 q8x16_t fqres = {};
Georgios Pinitas55186712018-01-08 17:37:12 +00001033
1034 if(pooling_type == PoolingType::AVG)
1035 {
1036 // Convert data to u16
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001037 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
1038 const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
1039 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +00001040
1041 // Calculate row sums
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001042 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001043 {
1044 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001045 wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
1046 wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001047 }
1048 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001049 const q16x8x2_t vrsum_shifted_1 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001050 {
1051 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001052 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
1053 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001054 }
1055 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001056 const q16x8x2_t vrsum_shifted_2 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001057 {
1058 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001059 wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
1060 wrapper::vext_2(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001061 }
1062 };
1063 // Calculate final sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001064 q16x8x2_t final_sum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001065 {
1066 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001067 wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1068 wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001069 }
1070 };
1071 if(pool_stride_x == 2)
1072 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001073 q16x8_t res =
Georgios Pinitas55186712018-01-08 17:37:12 +00001074 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001075 wrapper::vgetlane(final_sum.val[0], 0),
1076 wrapper::vgetlane(final_sum.val[0], 2),
1077 wrapper::vgetlane(final_sum.val[0], 4),
1078 wrapper::vgetlane(final_sum.val[0], 6),
1079 wrapper::vgetlane(final_sum.val[1], 0),
1080 wrapper::vgetlane(final_sum.val[1], 2),
1081 wrapper::vgetlane(final_sum.val[1], 4),
1082 wrapper::vgetlane(final_sum.val[1], 6),
Georgios Pinitas55186712018-01-08 17:37:12 +00001083 };
1084
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001085 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
1086 pool_size, upper_bound_w, upper_bound_h,
1087 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1088 fres = wrapper::vmovn(res);
Georgios Pinitas55186712018-01-08 17:37:12 +00001089 }
1090 else
1091 {
1092 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001093 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1094 pool_size, upper_bound_w, upper_bound_h,
1095 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001096 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001097 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1098 pool_size, upper_bound_w, upper_bound_h,
1099 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1100 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +00001101 }
1102 }
1103 else
1104 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001105 const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1106 const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1107 const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1108 const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
Georgios Pinitas55186712018-01-08 17:37:12 +00001109
1110 if(pool_stride_x == 2)
1111 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001112 const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1113 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1114 fres = wrapper::vtbl(table, lookup_val);
Georgios Pinitas55186712018-01-08 17:37:12 +00001115 }
1116 else
1117 {
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001118 fqres = final_max;
Georgios Pinitas55186712018-01-08 17:37:12 +00001119 }
1120 }
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001121
1122 // Store result
1123 if(pool_stride_x == 1)
1124 {
1125 if(input_qinfo != output_qinfo)
1126 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001127 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001128 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001129 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001130 }
1131 else
1132 {
1133 if(input_qinfo != output_qinfo)
1134 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001135 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001136 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001137 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001138 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001139 },
1140 input, output);
1141}
1142
Pablo Tello77e6c552018-12-04 15:33:49 +00001143void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001144{
Pablo Tello77e6c552018-12-04 15:33:49 +00001145 ARM_COMPUTE_UNUSED(pooling_type);
1146 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001147#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1148 Iterator input(_input, window_input);
1149 Iterator output(_output, window);
1150
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001151 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1152 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1153 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1154 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1155 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1156 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001157 int pool_stride_x = 0;
1158 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001159 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001160 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1161 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1162
1163 execute_window_loop(window, [&](const Coordinates & id)
1164 {
1165 float16_t res = 0.0f;
1166 float16x8_t vres = vdupq_n_f16(0.0f);
1167
1168 if(pooling_type != PoolingType::MAX)
1169 {
1170 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001171 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001172
1173 // Perform pooling
1174
1175 for(int y = 0; y < pool_size_y; ++y)
1176 {
1177 int x = 0;
1178 for(; x <= (pool_size_x - 8); x += 8)
1179 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001180 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1181 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001182
1183 // Get power of 2 in case of l2 pooling and accumulate
1184 if(pooling_type == PoolingType::L2)
1185 {
1186 vres = vaddq_f16(vres, vmulq_f16(data, data));
1187 }
1188 else
1189 {
1190 vres = vaddq_f16(vres, data);
1191 }
1192 }
1193
1194 // Leftover for loop
1195 for(; x < pool_size_x; ++x)
1196 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001197 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1198 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001199
1200 // Get power of 2 in case of l2 pooling
1201 if(pooling_type == PoolingType::L2)
1202 {
1203 data *= data;
1204 }
1205
1206 res += data;
1207 }
1208 }
1209
1210 // Reduction
1211 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1212 res += vget_lane_f16(tmp, 0);
1213 res += vget_lane_f16(tmp, 1);
1214 res += vget_lane_f16(tmp, 2);
1215 res += vget_lane_f16(tmp, 3);
1216
1217 // Divide by scale
1218 res *= scale;
1219 }
1220 else
1221 {
1222 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1223 res = std::numeric_limits<float>::lowest();
1224
1225 for(int y = 0; y < pool_size_y; ++y)
1226 {
1227 int x = 0;
1228 for(; x <= (pool_size_x - 8); x += 8)
1229 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001230 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1231 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001232 vres = vmaxq_f16(vres, data);
1233 }
1234
1235 // Leftover for loop
1236 for(; x < pool_size_x; ++x)
1237 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001238 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1239 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1240 res = std::max(res, data);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001241 }
1242 }
1243
1244 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1245 res = std::max(res, vget_lane_f16(tmp, 0));
1246 res = std::max(res, vget_lane_f16(tmp, 1));
1247 res = std::max(res, vget_lane_f16(tmp, 2));
1248 res = std::max(res, vget_lane_f16(tmp, 3));
1249 }
1250
1251 // Calculate square-root in case of l2 pooling
1252 if(pooling_type == PoolingType::L2)
1253 {
1254 res = std::sqrt(res);
1255 }
1256
1257 // Store result
1258 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1259 },
1260 input, output);
1261
1262#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1263 ARM_COMPUTE_UNUSED(window_input);
1264 ARM_COMPUTE_UNUSED(window);
1265 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1266#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1267}
1268
Pablo Tello77e6c552018-12-04 15:33:49 +00001269void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001270{
Pablo Tello77e6c552018-12-04 15:33:49 +00001271 ARM_COMPUTE_UNUSED(pooling_type);
1272 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001273#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1274 Iterator input(_input, window_input);
1275 Iterator output(_output, window);
1276
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001277 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1278 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1279 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1280 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1281 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1282 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001283 int pool_stride_x = 0;
1284 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001285 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001286 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1287 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1288
1289 float16x8_t vres;
1290
1291 execute_window_loop(window, [&](const Coordinates & id)
1292 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001293 const int idx_width = id.y() * pool_stride_x;
1294 const int idx_height = id.z() * pool_stride_y;
1295 const int pool_limit_y = pool_pad_top - idx_height;
1296 const int pool_limit_x = pool_pad_left - idx_width;
1297
1298 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1299 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1300 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1301 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1302
Michalis Spyrou57dac842018-03-01 16:03:50 +00001303 if(pooling_type != PoolingType::MAX)
1304 {
1305 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001306 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1307 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001308 const float16x8_t scale_v = vdupq_n_f16(scale);
1309
1310 // Perform pooling
1311 vres = vdupq_n_f16(0.0f);
Michalis Spyrouced25572018-10-01 16:26:20 +01001312 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001313 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001314 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001315 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001316 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1317 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001318
1319 // Get power of 2 in case of l2 pooling and accumulate
1320 if(pooling_type == PoolingType::L2)
1321 {
1322 vres = vaddq_f16(vres, vmulq_f16(data, data));
1323 }
1324 else
1325 {
1326 vres = vaddq_f16(vres, data);
1327 }
1328 }
1329 }
1330 // Divide by scale
1331 vres = vmulq_f16(vres, scale_v);
1332 }
1333 else
1334 {
1335 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001336
1337 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001338 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001339 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001340 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001341 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1342 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001343 vres = vmaxq_f16(vres, data);
1344 }
1345 }
1346 }
1347
1348 // Calculate square-root in case of l2 pooling
1349 if(pooling_type == PoolingType::L2)
1350 {
1351 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1352 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1353 }
1354
1355 // Store result
1356 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
1357 },
1358 input, output);
1359
1360#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1361 ARM_COMPUTE_UNUSED(window_input);
1362 ARM_COMPUTE_UNUSED(window);
1363 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1364#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1365}
1366
Pablo Tello77e6c552018-12-04 15:33:49 +00001367void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001368{
1369 Iterator input(_input, window_input);
1370 Iterator output(_output, window);
1371
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001372 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1373 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1374 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1375 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1376 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1377 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001378 int pool_stride_x = 0;
1379 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001380 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001381 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1382 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001383
1384 execute_window_loop(window, [&](const Coordinates & id)
1385 {
1386 float res = 0.0f;
1387
1388 if(pooling_type != PoolingType::MAX)
1389 {
1390 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001391 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001392
1393 // Perform pooling
1394 float32x4_t vres = vdupq_n_f32(0.0f);
1395
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001396 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001397 {
1398 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001399 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001400 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001401 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1402 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001403
1404 // Get power of 2 in case of l2 pooling and accumulate
1405 if(pooling_type == PoolingType::L2)
1406 {
1407 vres = vmlaq_f32(vres, data, data);
1408 }
1409 else
1410 {
1411 vres = vaddq_f32(vres, data);
1412 }
1413 }
1414
1415 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001416 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001417 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001418 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1419 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001420
1421 // Get power of 2 in case of l2 pooling
1422 if(pooling_type == PoolingType::L2)
1423 {
1424 data *= data;
1425 }
1426
1427 res += data;
1428 }
1429 }
1430
1431#if defined(__aarch64__)
1432 // Reduction operation available on 64 bit architectures only
1433 res += vaddvq_f32(vres);
1434#else // __aarch64__
1435 // Reduction
1436 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1437 tmp = vpadd_f32(tmp, tmp);
1438
1439 res += vget_lane_f32(tmp, 0);
1440#endif // __aarch64__
1441 // Divide by scale
1442 res *= scale;
1443 }
1444 else
1445 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001446 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1447 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001448
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001449 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001450 {
1451 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001452 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001453 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001454 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1455 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001456 vres = vmaxq_f32(vres, data);
1457 }
1458
1459 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001460 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001461 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001462 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1463 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001464 res = std::max(res, data);
1465 }
1466 }
Gian Marco Iodice16824302017-09-28 15:41:37 +01001467#if defined(__aarch64__)
1468 // Reduction operation available on 64 bit architectures only
1469 res = std::max(vmaxvq_f32(vres), res);
1470#else // __aarch64__
1471 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1472 tmp = vpmax_f32(tmp, tmp);
1473
1474 res = std::max(res, vget_lane_f32(tmp, 0));
1475#endif // __aarch64__
1476 }
1477
1478 // Calculate square-root in case of l2 pooling
1479 if(pooling_type == PoolingType::L2)
1480 {
1481 res = std::sqrt(res);
1482 }
1483
1484 // Store result
1485 *(reinterpret_cast<float *>(output.ptr())) = res;
1486 },
1487 input, output);
1488}
1489
morgolockcc1f6c92020-03-24 09:26:48 +00001490void NEPoolingLayerKernel::pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window)
Pablo Tello77e6c552018-12-04 15:33:49 +00001491{
morgolockcc1f6c92020-03-24 09:26:48 +00001492 Iterator input(_input, window_input);
1493 Iterator output(_output, window);
1494 Iterator indices(_indices, window);
1495 int final_index = 0;
1496 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1497 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1498 int pool_stride_x = 0;
1499 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001500 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001501 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1502 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1503
morgolockcc1f6c92020-03-24 09:26:48 +00001504 const Strides &input_strides = _input->info()->strides_in_bytes();
1505 const auto in_stridew = input_strides[1];
1506
1507 execute_window_loop(window, [&](const Coordinates &)
Pablo Tello77e6c552018-12-04 15:33:49 +00001508 {
morgolockcc1f6c92020-03-24 09:26:48 +00001509 const auto input_offset_top = input_top_ptr + input.offset();
1510 const auto input_offset_bottom = input_bottom_ptr + input.offset();
1511 const auto in_top_ptr = reinterpret_cast<const float *>(input_offset_top);
1512 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_offset_bottom);
1513 float32x2_t top_data = vld1_f32(in_top_ptr);
1514 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1515 float32x2_t res = {};
1516 float final_res = 0;
1517 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1518 res = vpmax_f32(max_data, max_data);
1519 final_res = vget_lane_f32(res, 0);
Pablo Tello77e6c552018-12-04 15:33:49 +00001520 // Store result
1521 *(reinterpret_cast<float *>(output.ptr())) = final_res;
morgolockcc1f6c92020-03-24 09:26:48 +00001522 const uint32_t offset_top = (uint32_t)(input.offset() / sizeof(float));
1523 const uint32_t offset_bottom = (uint32_t)offset_top + (in_stridew / sizeof(float));
1524 const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
1525 const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
1526 const uint32x2_t tmp_indices = vbsl_u32(vcgt_f32(top_data, bottom_data), voffset_top, voffset_bottom);
1527 final_index = vget_lane_u32(vbsl_u32(vcgt_f32(max_data, vrev64_f32(max_data)), tmp_indices, vrev64_u32(tmp_indices)), 0);
1528 *(reinterpret_cast<int *>(indices.ptr())) = final_index;
Pablo Tello77e6c552018-12-04 15:33:49 +00001529 },
morgolockcc1f6c92020-03-24 09:26:48 +00001530 input, output, indices);
1531}
1532
1533void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
1534 bool exclude_padding)
1535{
1536 if(pooling_type == PoolingType::MAX && _indices)
1537 {
1538 pooling2_f32_nchw_maxpool_indices(window_input, window);
1539 }
1540 else
1541 {
1542 Iterator input(_input, window_input);
1543 Iterator output(_output, window);
1544 constexpr int pool_size = 2;
1545 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1546 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1547 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1548 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1549 int pool_stride_x = 0;
1550 int pool_stride_y = 0;
1551 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1552 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1553 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1554
1555 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1556 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1557
1558 execute_window_loop(window, [&](const Coordinates & id)
1559 {
1560 const auto in_top_ptr = reinterpret_cast<const float *>(input_top_ptr + input.offset());
1561 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
1562 float32x2_t top_data = vld1_f32(in_top_ptr);
1563 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1564 float32x2_t res = {};
1565 float final_res = 0;
1566 // Get power of 2 in case of l2 pooling
1567 if(pooling_type == PoolingType::L2)
1568 {
1569 top_data = vmul_f32(top_data, top_data);
1570 bottom_data = vmul_f32(bottom_data, bottom_data);
1571 }
1572
1573 if(pooling_type != PoolingType::MAX)
1574 {
1575 // Calculate scale
1576 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1577 const float32x2_t scale_v = vdup_n_f32(scale);
1578
1579 // Perform pooling
1580 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1581 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1582 }
1583 else
1584 {
1585 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1586 res = vpmax_f32(max_data, max_data);
1587 }
1588 final_res = vget_lane_f32(res, 0);
1589
1590 // Calculate square-root in case of l2 pooling
1591 if(pooling_type == PoolingType::L2)
1592 {
1593 final_res = sqrt(final_res);
1594 }
1595
1596 // Store result
1597 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1598 },
1599 input, output);
1600 }
Pablo Tello77e6c552018-12-04 15:33:49 +00001601}
1602
1603void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1604{
1605 Iterator input(_input, window_input);
1606 Iterator output(_output, window);
1607
1608 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001609 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1610 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1611 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1612 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001613 int pool_stride_x = 0;
1614 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001615 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001616 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1617 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1618
1619 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1620 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1621 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1622
1623 execute_window_loop(window, [&](const Coordinates & id)
1624 {
1625 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1626 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1627 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1628 float32x2_t res = {};
1629 float final_res = 0;
1630
1631 // Get power of 2 in case of l2 pooling
1632 if(pooling_type == PoolingType::L2)
1633 {
1634 top_data = vmulq_f32(top_data, top_data);
1635 middle_data = vmulq_f32(middle_data, middle_data);
1636 bottom_data = vmulq_f32(bottom_data, bottom_data);
1637 }
1638
1639 if(pooling_type != PoolingType::MAX)
1640 {
1641 // Calculate scale
1642 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1643 const float32x2_t scale_v = vdup_n_f32(scale);
1644
1645 // Perform pooling
1646 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1647 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1648 res = vmul_f32(vpadd_f32(res, res), scale_v);
1649 }
1650 else
1651 {
1652 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1653 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1654 res = vpmax_f32(res, res);
1655 }
1656 final_res = vget_lane_f32(res, 0);
1657
1658 // Calculate square-root in case of l2 pooling
1659 if(pooling_type == PoolingType::L2)
1660 {
1661 final_res = sqrt(final_res);
1662 }
1663
1664 // Store result
1665 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1666 },
1667 input, output);
1668}
1669
1670void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1671{
1672 Iterator input(_input, window_input);
1673 Iterator output(_output, window);
1674
1675 constexpr const int pool_size = 7;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001676 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1677 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1678 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1679 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001680 int pool_stride_x = 0;
1681 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001682 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001683 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1684 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1685
1686 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1687 for(int i = 0; i < pool_size; ++i)
1688 {
1689 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1690 }
1691
1692 execute_window_loop(window, [&](const Coordinates & id)
1693 {
1694 float32x2_t res = {};
1695 float final_res = 0.f;
1696 if(pooling_type != PoolingType::MAX)
1697 {
1698 // Calculate scale
1699 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1700 const float32x2_t scale_v = vdup_n_f32(scale);
1701
1702 // Perform pooling
1703 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1704 // Get power of 2 in case of l2 pooling
1705 if(pooling_type == PoolingType::L2)
1706 {
1707 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1708 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1709 }
1710 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1711 for(int i = 1; i < pool_size; ++i)
1712 {
1713 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1714 // Get power of 2 in case of l2 pooling
1715 if(pooling_type == PoolingType::L2)
1716 {
1717 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1718 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1719 }
1720 sum_data = vaddq_f32(sum_data, data.val[0]);
1721 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1722 }
1723 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1724 res = vmul_f32(vpadd_f32(res, res), scale_v);
1725 }
1726 else
1727 {
1728 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1729 for(int i = 1; i < pool_size; ++i)
1730 {
1731 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1732 max_data = vmax2q_f32(max_data, data);
1733 }
1734 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1735 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1736 res = vpmax_f32(res, res);
1737 }
1738 final_res = vget_lane_f32(res, 0);
1739
1740 // Calculate square-root in case of l2 pooling
1741 if(pooling_type == PoolingType::L2)
1742 {
1743 final_res = sqrt(final_res);
1744 }
1745
1746 // Store result
1747 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1748 },
1749 input, output);
1750}
1751
1752void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001753{
1754 Iterator input(_input, window_input);
1755 Iterator output(_output, window);
1756
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001757 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1758 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1759 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1760 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1761 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1762 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001763 int pool_stride_x = 0;
1764 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001765 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001766 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1767 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1768
1769 float32x4_t vres;
1770
1771 execute_window_loop(window, [&](const Coordinates & id)
1772 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001773 const int idx_width = id.y() * pool_stride_x;
1774 const int idx_height = id.z() * pool_stride_y;
1775 const int pool_limit_y = pool_pad_top - idx_height;
1776 const int pool_limit_x = pool_pad_left - idx_width;
1777
1778 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1779 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1780 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1781 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1782
Michalis Spyrou57dac842018-03-01 16:03:50 +00001783 if(pooling_type != PoolingType::MAX)
1784 {
1785 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001786 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1787 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001788 const float32x4_t scale_v = vdupq_n_f32(scale);
1789
1790 // Perform pooling
1791 vres = vdupq_n_f32(0.0f);
1792
Michalis Spyrouced25572018-10-01 16:26:20 +01001793 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001794 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001795 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001796 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001797 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1798 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001799
1800 // Get power of 2 in case of l2 pooling and accumulate
1801 if(pooling_type == PoolingType::L2)
1802 {
1803 vres = vmlaq_f32(vres, data, data);
1804 }
1805 else
1806 {
1807 vres = vaddq_f32(vres, data);
1808 }
1809 }
1810 }
1811 // Divide by scale
1812 vres = vmulq_f32(vres, scale_v);
1813 }
1814 else
1815 {
1816 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001817 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001818 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001819 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001820 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001821 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1822 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001823 vres = vmaxq_f32(vres, data);
1824 }
1825 }
1826 }
1827
1828 // Calculate square-root in case of l2 pooling
1829 if(pooling_type == PoolingType::L2)
1830 {
Georgios Pinitas27f223d2019-12-16 19:23:02 +00001831 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1832 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
1833 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
1834 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
1835 };
1836 vres = l2_res;
Michalis Spyrou57dac842018-03-01 16:03:50 +00001837 }
1838
1839 // Store result
1840 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
1841 },
1842 input, output);
1843}
1844
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001845template <typename T>
1846void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00001847{
1848 Iterator input(_input, window_input);
1849 Iterator output(_output, window);
1850
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001851 /** NEON vector types */
1852 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1853 using q16_t = typename wrapper::traits::promote_t<T>;
1854 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1855 using q32_t = typename wrapper::traits::promote_t<q16_t>;
1856 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
1857
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001858 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1859 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1860 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1861 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1862 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1863 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001864 int pool_stride_x = 0;
1865 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001866 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001867 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1868 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001869
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001870 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1871 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
1872
Georgios Pinitas55186712018-01-08 17:37:12 +00001873 execute_window_loop(window, [&](const Coordinates & id)
1874 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001875 T res = std::numeric_limits<T>::min();
Georgios Pinitas55186712018-01-08 17:37:12 +00001876
1877 if(pooling_type != PoolingType::MAX)
1878 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001879 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1880 q32_t sres = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001881
1882 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001883 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001884
1885 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001886 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001887 {
1888 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001889 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001890 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001891 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1892 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00001893
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001894 const q16x8_t data_q16 = wrapper::vmovl(data);
1895 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001896 }
1897
1898 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001899 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001900 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001901 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1902 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00001903 sres += data;
1904 }
1905 }
1906
1907 // Reduction
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001908 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
1909 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
Georgios Pinitas55186712018-01-08 17:37:12 +00001910
1911 // Divide by scale
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001912 res = static_cast<T>(support::cpp11::round(sres * scale));
Georgios Pinitas55186712018-01-08 17:37:12 +00001913 }
1914 else
1915 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001916 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
Georgios Pinitas55186712018-01-08 17:37:12 +00001917
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001918 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001919 {
1920 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001921 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001922 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001923 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1924 (_input->info()->strides_in_bytes().y())));
1925 vres = wrapper::vmax(vres, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00001926 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001927 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001928 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001929 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001930 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1931 (_input->info()->strides_in_bytes().y())));
1932 res = std::max(res, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00001933 }
1934 }
1935
1936 // Reduce max
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001937 vres = wrapper::vpmax(vres, vres);
1938 vres = wrapper::vpmax(vres, vres);
1939 vres = wrapper::vpmax(vres, vres);
Georgios Pinitas55186712018-01-08 17:37:12 +00001940
1941 // Get max value
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001942 res = std::max(res, wrapper::vgetlane(vres, 0));
Georgios Pinitas55186712018-01-08 17:37:12 +00001943 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001944 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001945 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
1946 *(reinterpret_cast<T *>(output.ptr())) = res;
Georgios Pinitas55186712018-01-08 17:37:12 +00001947 },
1948 input, output);
1949}
1950
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001951template <typename T>
1952void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001953{
1954 Iterator input(_input, window_input);
1955 Iterator output(_output, window);
1956
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001957 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1958 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
1959 using q16_t = typename wrapper::traits::promote_t<T>;
1960 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1961 using q32_t = typename wrapper::traits::promote_t<q16_t>;
1962 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
1963
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001964 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1965 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1966 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1967 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1968 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1969 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001970
1971 int pool_stride_x = 0;
1972 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001973 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001974 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1975 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1976
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001977 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
1978 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
1979 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitas283fc602018-11-09 10:46:43 +00001980
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00001981 const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001982 // "new_offset" doesn't have to consider the "half_scale_v" in its computation
1983 // With a requantization performed in a single step there won't be uncertainties introduced
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00001984 const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001985
1986 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
1987 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1988 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
1989
Michalis Spyrou57dac842018-03-01 16:03:50 +00001990 execute_window_loop(window, [&](const Coordinates & id)
1991 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001992 const int idx_width = id.y() * pool_stride_x;
1993 const int idx_height = id.z() * pool_stride_y;
1994 const int pool_limit_y = pool_pad_top - idx_height;
1995 const int pool_limit_x = pool_pad_left - idx_width;
1996
1997 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1998 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1999 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2000 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
2001
Michalis Spyrou57dac842018-03-01 16:03:50 +00002002 if(pooling_type != PoolingType::MAX)
2003 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002004 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2005 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2006 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2007 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002008
2009 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00002010 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2011 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002012
2013 // Perform pooling
Michalis Spyrouced25572018-10-01 16:26:20 +01002014 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002015 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002016 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002017 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002018 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2019 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002020
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002021 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
2022 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
2023 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
2024 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
2025 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
2026 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002027 }
2028 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002029
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002030 if(input_qinfo != output_qinfo)
2031 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002032 const float32x4x4_t vres =
2033 {
2034 {
2035 vcvtq_f32_q32(vres1),
2036 vcvtq_f32_q32(vres2),
2037 vcvtq_f32_q32(vres3),
2038 vcvtq_f32_q32(vres4),
2039 }
2040 };
2041 const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
2042 // Store result
2043 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), wrapper::vgetlow(requantized_output));
2044 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, wrapper::vgethigh(requantized_output));
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002045 }
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002046 else
2047 {
2048 const float32x4_t scale_v = vdupq_n_f32(scale);
2049 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2050 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
2051 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
2052 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
2053 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002054
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002055 const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
2056 const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
2057 // Store result
2058 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res1);
2059 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, res2);
2060 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002061 }
2062 else
2063 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002064 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002065
Michalis Spyrouced25572018-10-01 16:26:20 +01002066 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002067 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002068 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002069 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002070 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2071 (_input->info()->strides_in_bytes().z())));
2072 vres = wrapper::vmax(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002073 }
2074 }
2075
2076 // Store result
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002077 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002078 }
2079 },
2080 input, output);
2081}
2082
morgolockcc1f6c92020-03-24 09:26:48 +00002083Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002084{
2085 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2086
2087 unsigned int pooled_w = 0;
2088 unsigned int pooled_h = 0;
2089 unsigned int num_elems_processed_per_iteration = 0;
2090 BorderSize border_size(0);
2091
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002092 const bool is_global_pooling = pool_info.is_global_pooling;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002093 unsigned int pool_size_x = 0;
2094 unsigned int pool_size_y = 0;
2095
2096 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +00002097 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2098 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2099 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002100
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002101 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2102 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002103
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002104 // Validate pool info before calling scaled_dimensions
2105 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002106
2107 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00002108 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2109 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002110 pool_size_x,
2111 pool_size_y,
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002112 pool_info.pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002113
morgolockcc1f6c92020-03-24 09:26:48 +00002114 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
2115 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
2116 (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002117 pool_size_x, pool_size_y)
2118 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002119
2120 return Status{};
2121}
2122
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002123void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002124{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002125 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002126 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2127 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2128 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2129
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002130 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
2131 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
2132 const unsigned int pool_size = _pool_info.pool_size.width;
2133 const bool exclude_padding = _pool_info.exclude_padding;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002134
Michalis Spyrou57dac842018-03-01 16:03:50 +00002135 Window window_input(window);
Georgios Pinitas14d9d982019-12-13 12:33:09 +00002136 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002137 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002138 // Set step for input in x and y direction for the input
2139 unsigned int window_x_inc = 0;
2140 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01002141 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002142 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002143 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002144 {
2145 window_x_inc = pool_stride_x;
2146 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2147 {
2148 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2149 }
2150 break;
2151 }
Pablo Tello77e6c552018-12-04 15:33:49 +00002152
Georgios Pinitas13d96e02018-08-23 11:20:23 +01002153 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002154 case DataType::F32:
2155 {
2156 window_x_inc = pool_stride_x;
2157 break;
2158 }
2159 default:
2160 {
2161 ARM_COMPUTE_ERROR("Not supported");
2162 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002163 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002164 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2165 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002166 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002167 else
2168 {
Georgios Pinitascac13b12018-04-27 19:07:19 +01002169 window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002170 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2171 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2172 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002173
2174 // Run function
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002175 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002176}
morgolockcc1f6c92020-03-24 09:26:48 +00002177} // namespace arm_compute