blob: d6b17534d303f167730575d84d17de7d613f8f94 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002 * Copyright (c) 2017-2020 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010038#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000039#include "support/ToolchainSupport.h"
40
Manuel Bottinib4bb8272019-12-18 18:01:27 +000041#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042#include <algorithm>
43#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010044#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010045#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010046#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010047#include <string>
48#include <tuple>
49
Manuel Bottinib4bb8272019-12-18 18:01:27 +000050namespace arm_compute
51{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010052using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010053
54namespace
55{
Pablo Tello77e6c552018-12-04 15:33:49 +000056inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010057 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
58{
Michalis Spyrou57dac842018-03-01 16:03:50 +000059 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
60 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
61
62 int start_x = id[idx_width] * stride_x - pad_x;
63 int start_y = id[idx_height] * stride_y - pad_y;
64
65 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
66 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000067 if(exclude_padding)
68 {
69 start_x = std::max(0, start_x);
70 start_y = std::max(0, start_y);
71 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010072 return 1.f / ((end_y - start_y) * (end_x - start_x));
73}
74
Manuel Bottinib4bb8272019-12-18 18:01:27 +000075template <typename T, typename TVec>
76inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
Georgios Pinitas55186712018-01-08 17:37:12 +000077 const int pool_size, const int upper_bound_w, const int upper_bound_h,
78 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
79{
80 int start_x = (id.x() + id_offset) * stride_x - pad_x;
81 int start_y = id.y() * stride_y - pad_y;
82 const int end_y = std::min(start_y + pool_size, upper_bound_h);
83 if(exclude_padding)
84 {
85 start_y = std::max(0, start_y);
86 }
87
Manuel Bottinib4bb8272019-12-18 18:01:27 +000088 std::array<T, 8> elems =
Georgios Pinitas55186712018-01-08 17:37:12 +000089 {
90 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +000091 wrapper::vgetlane(v, 0),
92 wrapper::vgetlane(v, 1),
93 wrapper::vgetlane(v, 2),
94 wrapper::vgetlane(v, 3),
95 wrapper::vgetlane(v, 4),
96 wrapper::vgetlane(v, 5),
97 wrapper::vgetlane(v, 6),
98 wrapper::vgetlane(v, 7),
Georgios Pinitas55186712018-01-08 17:37:12 +000099 }
100 };
101
102 for(auto &el : elems)
103 {
104 int c_start_x = start_x;
105 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
106 if(exclude_padding)
107 {
108 c_start_x = std::max(0, c_start_x);
109 }
110 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
111 el *= scale;
112 start_x += step * stride_x;
113 }
114
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000115 v = wrapper::vsetlane(elems[0], v, 0);
116 v = wrapper::vsetlane(elems[1], v, 1);
117 v = wrapper::vsetlane(elems[2], v, 2);
118 v = wrapper::vsetlane(elems[3], v, 3);
119 v = wrapper::vsetlane(elems[4], v, 4);
120 v = wrapper::vsetlane(elems[5], v, 5);
121 v = wrapper::vsetlane(elems[6], v, 6);
122 v = wrapper::vsetlane(elems[7], v, 7);
Georgios Pinitas55186712018-01-08 17:37:12 +0000123}
124
morgolockcc1f6c92020-03-24 09:26:48 +0000125Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
126 unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100127{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000128 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100129
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000130 int pool_stride_x = 0;
131 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000132 PoolingType pool_type = pool_info.pool_type;
133 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100135
Anthony Barbiereaefd002018-07-20 17:49:35 +0100136 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
morgolockcc1f6c92020-03-24 09:26:48 +0000137 if(indices)
138 {
morgolock37722d92020-04-09 14:17:48 +0100139 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
morgolockcc1f6c92020-03-24 09:26:48 +0000140 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
141 ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
142 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000143 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000144 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Michele Di Giorgio2c877192020-02-18 19:06:27 +0000145 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
146 && (input->data_layout() == DataLayout::NHWC),
147 "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000148
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000149 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100150 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000151 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000152 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
153 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
154 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
morgolockcc1f6c92020-03-24 09:26:48 +0000155
156 if(indices)
157 {
158 ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
morgolockcc1f6c92020-03-24 09:26:48 +0000159 ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
160 || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
161 }
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100162 }
163
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000164 return Status{};
165}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100166
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000167Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000168{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000169 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
170 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000171
172 return Status{};
173}
174
morgolockcc1f6c92020-03-24 09:26:48 +0000175std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
176 unsigned int &num_elems_processed_per_iteration,
177 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000178 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000179{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100180 // Output auto inizialitation if not yet initialized
181 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
morgolockcc1f6c92020-03-24 09:26:48 +0000182 if(indices)
183 {
184 // Indices auto inizialitation if not yet initialized
morgolocke383c352020-04-03 16:57:46 +0100185 auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
186 pool_info)))
187 .set_data_type(DataType::U32) /* we store the offset to the element */);
morgolockcc1f6c92020-03-24 09:26:48 +0000188 }
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000189 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000190 unsigned int num_elems_read_per_iteration = 0;
191 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000192 int pool_stride_x = 0;
193 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000194 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
195 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
196 const int input_width = input->dimension(idx_width);
197 const int input_height = input->dimension(idx_height);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000198 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000199 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000200 const int pool_pad_right = pad_stride_info.pad_right();
201 const int pool_pad_top = pad_stride_info.pad_top();
202 const int pool_pad_left = pad_stride_info.pad_left();
203 const int pool_pad_bottom = pad_stride_info.pad_bottom();
204 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000205
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000206 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +0000207 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
208 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000209 pool_size_x,
210 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000211 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100212
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000213 //If it's not squared and optimized will be executed the MxN
214 num_elems_read_per_iteration = 1;
215 num_elems_processed_per_iteration = 1;
216 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100217
Michalis Spyrou57dac842018-03-01 16:03:50 +0000218 const bool is_nhwc = data_layout == DataLayout::NHWC;
219
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000220 if(is_square)
221 {
222 switch(input->data_type())
223 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000224 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000225 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000226 if(is_nhwc)
227 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100228 num_elems_processed_per_iteration = 16;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000229 break;
230 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000231 switch(pool_size_x)
232 {
233 case 2:
234 num_elems_read_per_iteration = 16;
235 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
236 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
237 break;
238 case 3:
239 num_elems_read_per_iteration = 16;
240 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
241 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
242 break;
243 default:
244 break;
245 }
246 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000247#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
248 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000249 if(is_nhwc)
250 {
251 num_elems_processed_per_iteration = 8;
252 break;
253 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000254 switch(pool_size_x)
255 {
256 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000257 case 3:
258 num_elems_read_per_iteration = 4;
259 num_elems_processed_per_iteration = 1;
260 num_elems_horizontal_window = 1;
261 break;
262 default:
263 break;
264 }
265 break;
266#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
267 case DataType::F32:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000268 if(is_nhwc)
269 {
Georgios Pinitas64f1a902018-09-18 13:42:51 +0100270 num_elems_processed_per_iteration = 4;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000271 break;
272 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000273 switch(pool_size_x)
274 {
275 case 2:
276 num_elems_read_per_iteration = 2;
277 break;
278 case 3:
279 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
280 break;
281 case 7:
282 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
283 break;
284 default:
285 break;
286 }
287 num_elems_processed_per_iteration = 1;
288 num_elems_horizontal_window = 1;
289 break;
290 default:
291 ARM_COMPUTE_ERROR("Element size not supported");
292 break;
293 }
294 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000295 else
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000296 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000297 if(is_nhwc)
298 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100299 num_elems_processed_per_iteration = 16 / input->element_size();
Michalis Spyrou57dac842018-03-01 16:03:50 +0000300 }
301 }
302
303 bool window_changed = false;
304 Window win{};
305 if(data_layout == DataLayout::NCHW)
306 {
307 // Number of iterations in X dimension
308 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000309 // Upper limit for the number of right/bottom border elements that are accessed
310 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
311 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
morgolockcc1f6c92020-03-24 09:26:48 +0000312 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
313 border_size.right = std::max(upper_bound_w, pool_pad_right);
314 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000315 TensorShape output_shape{ input->tensor_shape() };
316 output_shape.set(0, pooled_w);
317 output_shape.set(1, pooled_h);
318 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000319 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
morgolockcc1f6c92020-03-24 09:26:48 +0000320 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000321 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
morgolockcc1f6c92020-03-24 09:26:48 +0000322 if(indices)
323 {
324 AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
325 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
326 }
327 else
328 {
329 window_changed = update_window_and_padding(win, input_access, output_access);
330 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000331 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
332 }
333 else
334 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000335 TensorShape output_shape{ input->tensor_shape() };
336 output_shape.set(1, pooled_w);
337 output_shape.set(2, pooled_h);
338 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000339 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
340 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000341 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
morgolockcc1f6c92020-03-24 09:26:48 +0000342 if(indices)
343 {
344 AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
345 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
346 }
347 else
348 {
349 window_changed = update_window_and_padding(win, input_access, output_access);
350 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000351 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000352 }
353
354 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
355 return std::make_pair(err, win);
356}
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000357
358template <typename T>
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000359inline T vcvtq_q32_f32(float32x4_t values);
360
361template <>
362inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
363{
364 return vcvtq_u32_f32(values);
365}
366
367template <>
368inline int32x4_t vcvtq_q32_f32(float32x4_t values)
369{
370 return vcvtq_s32_f32(values);
371}
372
373template <typename T>
374inline float32x4_t vcvtq_f32_q32(T values);
375
376template <>
377inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
378{
379 return vcvtq_f32_u32(values);
380}
381
382template <>
383inline float32x4_t vcvtq_f32_q32(int32x4_t values)
384{
385 return vcvtq_f32_s32(values);
386}
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000387
388template <typename Tout>
389inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
390
391template <>
392inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
393{
394 const float new_scale = quant_rescale / scale_pooling;
395 return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
396}
397
398template <>
399inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
400{
401 const float new_scale = quant_rescale / scale_pooling;
402 return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
403}
404
405template <typename Tin, typename Tout>
406inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
407
408template <>
409inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
410{
411 const float32x4x4_t acc =
412 {
413 {
414 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
415 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
416 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
417 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
418 }
419 };
420 return vquantize(acc, requant_qinfo);
421}
422
423template <>
424inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
425{
426 const float32x4x4_t acc =
427 {
428 {
429 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
430 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
431 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
432 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
433 }
434 };
435 return vquantize_signed(acc, requant_qinfo);
436}
437
438template <typename T>
439inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
440
441template <>
442inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
443{
444 const float32x4x2_t acc =
445 {
446 {
447 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
448 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
449 }
450 };
451 return vquantize(acc, requant_qinfo);
452}
453
454template <>
455inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
456{
457 const float32x4x2_t acc =
458 {
459 {
460 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
461 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
462 }
463 };
464 return vquantize_signed(acc, requant_qinfo);
465}
466
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000467} // namespace
468
469NEPoolingLayerKernel::NEPoolingLayerKernel()
morgolockcc1f6c92020-03-24 09:26:48 +0000470 : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000471{
472}
473
474BorderSize NEPoolingLayerKernel::border_size() const
475{
476 return _border_size;
477}
478
morgolockcc1f6c92020-03-24 09:26:48 +0000479void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000480{
481 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000482 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
483 const bool is_global_pooling = pool_info.is_global_pooling;
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000484 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000485
486 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000487 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
488 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
489 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000490
491 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000492 const Size2D pool_size(
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000493 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
494 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000495
496 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000497 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000498
499 // Check output dimensions
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100500 unsigned int pooled_w;
501 unsigned int pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000502 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
503 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000504 pool_size.x(),
505 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000506 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000507
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000508 // Perform validation step
morgolockcc1f6c92020-03-24 09:26:48 +0000509 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100510
511 // Set instance variables
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000512 _input = input;
513 _output = output;
morgolockcc1f6c92020-03-24 09:26:48 +0000514 _indices = indices;
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000515 _pool_info = pool_info;
516 _data_layout = input->info()->data_layout();
517 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100518
Georgios Pinitas55186712018-01-08 17:37:12 +0000519 // Get data type
520 const DataType data_type = input->info()->data_type();
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000521 const bool is_nchw = _data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000522
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100523 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000524 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000525 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000526 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000527 if(is_nchw)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100528 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000529 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000530 }
531 else
532 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000533 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000534 }
535 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000536 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000537 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000538 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000539 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000540 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000541 }
542 else
543 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000544 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000545 }
546 }
547 else
548 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000549 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000550 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000551 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000552 }
553 else
554 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000555 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
556 }
557 }
558 }
559 else if(data_type == DataType::QASYMM8_SIGNED)
560 {
561 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
562 {
563 if(is_nchw)
564 {
565 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
566 }
567 else
568 {
569 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
570 }
571 }
572 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
573 {
574 if(is_nchw)
575 {
576 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
577 }
578 else
579 {
580 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
581 }
582 }
583 else
584 {
585 if(is_nchw)
586 {
587 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
588 }
589 else
590 {
591 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000592 }
593 }
594 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000595 else if(data_type == DataType::F16)
596 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000597 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000598 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000599 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000600 {
601 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000602 {
603 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000604 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000605 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000606 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000607 else
608 {
609 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
610 }
611 }
612 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000613 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000614 {
615 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000616 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000617 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000618 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000619 else
620 {
621 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
622 }
623 }
624 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000625 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000626 {
627 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000628 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000629 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
630 }
631 else
632 {
633 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000634 }
635 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000636 }
637 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000638 }
639 }
640 else
641 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000642 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000643 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000644 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
645 }
646 else
647 {
648 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000649 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000650 }
651 }
652 else if(data_type == DataType::F32)
653 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000654 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000655 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000656 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000657 {
658 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000659 {
660 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000661 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000662 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
663 }
664 else
665 {
666 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000667 }
668 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000669 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000670 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000671 {
672 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000673 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000674 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
675 }
676 else
677 {
678 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000679 }
680 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000681 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000682 case 7:
Pablo Tello77e6c552018-12-04 15:33:49 +0000683 {
684 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000685 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000686 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
687 }
688 else
689 {
690 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000691 }
692 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000693 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000694 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000695 {
696 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000697 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000698 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
699 }
700 else
701 {
702 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000703 }
704 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000705 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000706 }
707 }
708 else
709 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000710 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000711 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000712 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
713 }
714 else
715 {
716 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000717 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000718 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100719 }
720
721 // Configure kernel window
morgolockcc1f6c92020-03-24 09:26:48 +0000722 auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
723 pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000724 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
725 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100726}
727
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000728template <typename T>
729void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000730{
731 Iterator input(_input, window_input);
732 Iterator output(_output, window);
733
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000734 /** NEON vector types */
735 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
736 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
737 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
738 using q16_t = typename wrapper::traits::promote_t<T>;
739 using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
740 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
741 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
742
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000743 constexpr int pool_size = 2;
744 int pool_stride_x = 0;
745 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000746 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
747 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
748 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
749 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
750 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000751 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
752 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000753
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000754 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
755 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000756
757 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
758
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100759 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
760 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
761 const bool have_different_qinfo = input_qinfo != output_qinfo;
762
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000763 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
764 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
765 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
766
Georgios Pinitas55186712018-01-08 17:37:12 +0000767 execute_window_loop(window, [&](const Coordinates & id)
768 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000769 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
770 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
771 q8x8_t lower_res = {};
772 q8x8_t upper_res = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000773
774 if(pooling_type != PoolingType::MAX)
775 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000776 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
777 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000778
779 // Add rows
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000780 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000781 {
782 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000783 wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
784 wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000785 }
786 };
787
788 // Pair-wise add row data
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000789 const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
790 const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +0000791
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000792 q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
Georgios Pinitas55186712018-01-08 17:37:12 +0000793
794 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000795 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
796 pool_size, upper_bound_w, upper_bound_h,
797 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
798 lower_res = wrapper::vmovn(res_lower);
Georgios Pinitas55186712018-01-08 17:37:12 +0000799
800 // Compute upper result for stride_x == 1
801 if(pool_stride_x == 1)
802 {
803 // Shifted row sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000804 const q16x8x2_t vrsum_shifted =
Georgios Pinitas55186712018-01-08 17:37:12 +0000805 {
806 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000807 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
808 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000809 }
810 };
811
812 // Pair-wise add shifted row
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000813 q16x8_t res_upper = wrapper::vcombine(
814 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
815 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
Georgios Pinitas55186712018-01-08 17:37:12 +0000816
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000817 // Scale upper result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000818 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
819 pool_size, upper_bound_w, upper_bound_h,
820 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
821 upper_res = wrapper::vmovn(res_upper);
Georgios Pinitas55186712018-01-08 17:37:12 +0000822 }
823 }
824 else
825 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000826 const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
827 lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
Georgios Pinitas55186712018-01-08 17:37:12 +0000828 if(pool_stride_x == 1)
829 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000830 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
831 upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
Georgios Pinitas55186712018-01-08 17:37:12 +0000832 }
833 }
834
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100835 if(have_different_qinfo)
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100836 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000837 const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000838 lower_res = wrapper::vgetlow(requantized_output);
839 upper_res = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100840 }
841
Georgios Pinitas55186712018-01-08 17:37:12 +0000842 // Store result
843 if(pool_stride_x == 1)
844 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000845 const q8x8x2_t res = { { lower_res, upper_res } };
846 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000847 }
848 else
849 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000850 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000851 }
852 },
853 input, output);
854}
855
Pablo Tello77e6c552018-12-04 15:33:49 +0000856void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100857{
Pablo Tello77e6c552018-12-04 15:33:49 +0000858 ARM_COMPUTE_UNUSED(pooling_type);
859 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000860#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100861 Iterator input(_input, window_input);
862 Iterator output(_output, window);
863
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000864 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000865 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
866 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
867 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
868 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000869 int pool_stride_x = 0;
870 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000871 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000872 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
873 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100874
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000875 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
876 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
877 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100878
879 execute_window_loop(window, [&](const Coordinates & id)
880 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100881 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
882 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
883 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
884 float16x4_t res = {};
885
886 // Get power of 2 in case of l2 pooling
887 if(pooling_type == PoolingType::L2)
888 {
889 top_data = vmul_f16(top_data, top_data);
890 middle_data = vmul_f16(middle_data, middle_data);
891 bottom_data = vmul_f16(bottom_data, bottom_data);
892 }
893
894 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100895 {
896 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000897 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100898 const float16x4_t scale_v = vdup_n_f16(scale);
899 // Perform pooling
900 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
901 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
902 res = vmul_f16(vpadd_f16(res, res), scale_v);
903 }
904 else
905 {
906 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
907 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
908 res = vpmax_f16(res, res);
909 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100910
911 // Calculate square-root in case of l2 pooling
912 if(pooling_type == PoolingType::L2)
913 {
914 res = vinv_f16(vinvsqrt_f16(res));
915 }
916
Pablo Tello0c34fe22017-06-26 17:17:42 +0100917 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
918 },
919 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000920#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100921 ARM_COMPUTE_UNUSED(window_input);
922 ARM_COMPUTE_UNUSED(window);
923 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000924#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100925}
926
Pablo Tello77e6c552018-12-04 15:33:49 +0000927void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100928{
Pablo Tello77e6c552018-12-04 15:33:49 +0000929 ARM_COMPUTE_UNUSED(pooling_type);
930 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000931#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100932 Iterator input(_input, window_input);
933 Iterator output(_output, window);
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000934 constexpr int pool_size = 2;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000935 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
936 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
937 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
938 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000939 int pool_stride_x, pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000940 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000941 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
942 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100943
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000944 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
945 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100946
947 execute_window_loop(window, [&](const Coordinates & id)
948 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100949 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
950 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
951 float16x4_t res = {};
Pablo Tello0c34fe22017-06-26 17:17:42 +0100952
Georgios Pinitascdf51452017-08-31 14:21:36 +0100953 // Get power of 2 in case of l2 pooling
954 if(pooling_type == PoolingType::L2)
955 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100956 top_data = vmul_f16(top_data, top_data);
957 bottom_data = vmul_f16(bottom_data, bottom_data);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100958 }
959
960 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100961 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000962 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100963 const float16x4_t scale_v = vdup_n_f16(scale);
964
965 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
966 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100967 }
968 else
969 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100970 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
971 res = vpmax_f16(max_data, max_data);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100972 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100973
974 // Calculate square-root in case of l2 pooling
975 if(pooling_type == PoolingType::L2)
976 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100977 res = vinv_f16(vinvsqrt_f16(res));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100978 }
979
980 // Store result
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100981 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100982 },
983 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000984#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100985 ARM_COMPUTE_UNUSED(window_input);
986 ARM_COMPUTE_UNUSED(window);
987 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000988#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100989}
990
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000991template <typename T>
992void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000993{
994 Iterator input(_input, window_input);
995 Iterator output(_output, window);
996
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000997 /** NEON vector types */
998 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
999 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
1000 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
1001 using q16_t = typename wrapper::traits::promote_t<T>;
1002 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1003 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
1004
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001005 constexpr int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001006 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1007 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1008 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1009 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001010 int pool_stride_x = 0;
1011 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001012 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001013 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1014 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001015
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001016 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1017 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001018
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001019 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
1020 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1021 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
1022
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001023 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
1024 const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
1025 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001026
1027 execute_window_loop(window, [&](const Coordinates & id)
1028 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001029 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
1030 const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
1031 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
1032 q8x8_t fres = {};
1033 q8x16_t fqres = {};
Georgios Pinitas55186712018-01-08 17:37:12 +00001034
1035 if(pooling_type == PoolingType::AVG)
1036 {
1037 // Convert data to u16
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001038 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
1039 const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
1040 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +00001041
1042 // Calculate row sums
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001043 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001044 {
1045 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001046 wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
1047 wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001048 }
1049 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001050 const q16x8x2_t vrsum_shifted_1 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001051 {
1052 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001053 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
1054 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001055 }
1056 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001057 const q16x8x2_t vrsum_shifted_2 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001058 {
1059 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001060 wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
1061 wrapper::vext_2(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001062 }
1063 };
1064 // Calculate final sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001065 q16x8x2_t final_sum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001066 {
1067 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001068 wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1069 wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001070 }
1071 };
1072 if(pool_stride_x == 2)
1073 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001074 q16x8_t res =
Georgios Pinitas55186712018-01-08 17:37:12 +00001075 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001076 wrapper::vgetlane(final_sum.val[0], 0),
1077 wrapper::vgetlane(final_sum.val[0], 2),
1078 wrapper::vgetlane(final_sum.val[0], 4),
1079 wrapper::vgetlane(final_sum.val[0], 6),
1080 wrapper::vgetlane(final_sum.val[1], 0),
1081 wrapper::vgetlane(final_sum.val[1], 2),
1082 wrapper::vgetlane(final_sum.val[1], 4),
1083 wrapper::vgetlane(final_sum.val[1], 6),
Georgios Pinitas55186712018-01-08 17:37:12 +00001084 };
1085
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001086 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
1087 pool_size, upper_bound_w, upper_bound_h,
1088 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1089 fres = wrapper::vmovn(res);
Georgios Pinitas55186712018-01-08 17:37:12 +00001090 }
1091 else
1092 {
1093 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001094 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1095 pool_size, upper_bound_w, upper_bound_h,
1096 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001097 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001098 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1099 pool_size, upper_bound_w, upper_bound_h,
1100 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1101 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +00001102 }
1103 }
1104 else
1105 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001106 const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1107 const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1108 const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1109 const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
Georgios Pinitas55186712018-01-08 17:37:12 +00001110
1111 if(pool_stride_x == 2)
1112 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001113 const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1114 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1115 fres = wrapper::vtbl(table, lookup_val);
Georgios Pinitas55186712018-01-08 17:37:12 +00001116 }
1117 else
1118 {
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001119 fqres = final_max;
Georgios Pinitas55186712018-01-08 17:37:12 +00001120 }
1121 }
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001122
1123 // Store result
1124 if(pool_stride_x == 1)
1125 {
1126 if(input_qinfo != output_qinfo)
1127 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001128 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001129 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001130 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001131 }
1132 else
1133 {
1134 if(input_qinfo != output_qinfo)
1135 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001136 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001137 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001138 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001139 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001140 },
1141 input, output);
1142}
1143
Pablo Tello77e6c552018-12-04 15:33:49 +00001144void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001145{
Pablo Tello77e6c552018-12-04 15:33:49 +00001146 ARM_COMPUTE_UNUSED(pooling_type);
1147 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001148#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1149 Iterator input(_input, window_input);
1150 Iterator output(_output, window);
1151
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001152 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1153 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1154 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1155 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1156 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1157 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001158 int pool_stride_x = 0;
1159 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001160 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001161 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1162 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1163
1164 execute_window_loop(window, [&](const Coordinates & id)
1165 {
1166 float16_t res = 0.0f;
1167 float16x8_t vres = vdupq_n_f16(0.0f);
1168
1169 if(pooling_type != PoolingType::MAX)
1170 {
1171 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001172 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001173
1174 // Perform pooling
1175
1176 for(int y = 0; y < pool_size_y; ++y)
1177 {
1178 int x = 0;
1179 for(; x <= (pool_size_x - 8); x += 8)
1180 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001181 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1182 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001183
1184 // Get power of 2 in case of l2 pooling and accumulate
1185 if(pooling_type == PoolingType::L2)
1186 {
1187 vres = vaddq_f16(vres, vmulq_f16(data, data));
1188 }
1189 else
1190 {
1191 vres = vaddq_f16(vres, data);
1192 }
1193 }
1194
1195 // Leftover for loop
1196 for(; x < pool_size_x; ++x)
1197 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001198 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1199 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001200
1201 // Get power of 2 in case of l2 pooling
1202 if(pooling_type == PoolingType::L2)
1203 {
1204 data *= data;
1205 }
1206
1207 res += data;
1208 }
1209 }
1210
1211 // Reduction
1212 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1213 res += vget_lane_f16(tmp, 0);
1214 res += vget_lane_f16(tmp, 1);
1215 res += vget_lane_f16(tmp, 2);
1216 res += vget_lane_f16(tmp, 3);
1217
1218 // Divide by scale
1219 res *= scale;
1220 }
1221 else
1222 {
1223 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1224 res = std::numeric_limits<float>::lowest();
1225
1226 for(int y = 0; y < pool_size_y; ++y)
1227 {
1228 int x = 0;
1229 for(; x <= (pool_size_x - 8); x += 8)
1230 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001231 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1232 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001233 vres = vmaxq_f16(vres, data);
1234 }
1235
1236 // Leftover for loop
1237 for(; x < pool_size_x; ++x)
1238 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001239 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1240 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1241 res = std::max(res, data);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001242 }
1243 }
1244
1245 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1246 res = std::max(res, vget_lane_f16(tmp, 0));
1247 res = std::max(res, vget_lane_f16(tmp, 1));
1248 res = std::max(res, vget_lane_f16(tmp, 2));
1249 res = std::max(res, vget_lane_f16(tmp, 3));
1250 }
1251
1252 // Calculate square-root in case of l2 pooling
1253 if(pooling_type == PoolingType::L2)
1254 {
1255 res = std::sqrt(res);
1256 }
1257
1258 // Store result
1259 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1260 },
1261 input, output);
1262
1263#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1264 ARM_COMPUTE_UNUSED(window_input);
1265 ARM_COMPUTE_UNUSED(window);
1266 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1267#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1268}
1269
Pablo Tello77e6c552018-12-04 15:33:49 +00001270void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001271{
Pablo Tello77e6c552018-12-04 15:33:49 +00001272 ARM_COMPUTE_UNUSED(pooling_type);
1273 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001274#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1275 Iterator input(_input, window_input);
1276 Iterator output(_output, window);
1277
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001278 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1279 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1280 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1281 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1282 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1283 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001284 int pool_stride_x = 0;
1285 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001286 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001287 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1288 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1289
1290 float16x8_t vres;
1291
1292 execute_window_loop(window, [&](const Coordinates & id)
1293 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001294 const int idx_width = id.y() * pool_stride_x;
1295 const int idx_height = id.z() * pool_stride_y;
1296 const int pool_limit_y = pool_pad_top - idx_height;
1297 const int pool_limit_x = pool_pad_left - idx_width;
1298
1299 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1300 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1301 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1302 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1303
Michalis Spyrou57dac842018-03-01 16:03:50 +00001304 if(pooling_type != PoolingType::MAX)
1305 {
1306 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001307 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1308 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001309 const float16x8_t scale_v = vdupq_n_f16(scale);
1310
1311 // Perform pooling
1312 vres = vdupq_n_f16(0.0f);
Michalis Spyrouced25572018-10-01 16:26:20 +01001313 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001314 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001315 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001316 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001317 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1318 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001319
1320 // Get power of 2 in case of l2 pooling and accumulate
1321 if(pooling_type == PoolingType::L2)
1322 {
1323 vres = vaddq_f16(vres, vmulq_f16(data, data));
1324 }
1325 else
1326 {
1327 vres = vaddq_f16(vres, data);
1328 }
1329 }
1330 }
1331 // Divide by scale
1332 vres = vmulq_f16(vres, scale_v);
1333 }
1334 else
1335 {
1336 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001337
1338 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001339 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001340 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001341 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001342 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1343 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001344 vres = vmaxq_f16(vres, data);
1345 }
1346 }
1347 }
1348
1349 // Calculate square-root in case of l2 pooling
1350 if(pooling_type == PoolingType::L2)
1351 {
1352 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1353 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1354 }
1355
1356 // Store result
1357 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
1358 },
1359 input, output);
1360
1361#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1362 ARM_COMPUTE_UNUSED(window_input);
1363 ARM_COMPUTE_UNUSED(window);
1364 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1365#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1366}
1367
Pablo Tello77e6c552018-12-04 15:33:49 +00001368void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001369{
1370 Iterator input(_input, window_input);
1371 Iterator output(_output, window);
1372
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001373 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1374 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1375 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1376 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1377 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1378 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001379 int pool_stride_x = 0;
1380 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001381 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001382 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1383 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001384
1385 execute_window_loop(window, [&](const Coordinates & id)
1386 {
1387 float res = 0.0f;
1388
1389 if(pooling_type != PoolingType::MAX)
1390 {
1391 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001392 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001393
1394 // Perform pooling
1395 float32x4_t vres = vdupq_n_f32(0.0f);
1396
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001397 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001398 {
1399 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001400 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001401 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001402 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1403 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001404
1405 // Get power of 2 in case of l2 pooling and accumulate
1406 if(pooling_type == PoolingType::L2)
1407 {
1408 vres = vmlaq_f32(vres, data, data);
1409 }
1410 else
1411 {
1412 vres = vaddq_f32(vres, data);
1413 }
1414 }
1415
1416 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001417 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001418 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001419 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1420 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001421
1422 // Get power of 2 in case of l2 pooling
1423 if(pooling_type == PoolingType::L2)
1424 {
1425 data *= data;
1426 }
1427
1428 res += data;
1429 }
1430 }
1431
1432#if defined(__aarch64__)
1433 // Reduction operation available on 64 bit architectures only
1434 res += vaddvq_f32(vres);
1435#else // __aarch64__
1436 // Reduction
1437 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1438 tmp = vpadd_f32(tmp, tmp);
1439
1440 res += vget_lane_f32(tmp, 0);
1441#endif // __aarch64__
1442 // Divide by scale
1443 res *= scale;
1444 }
1445 else
1446 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001447 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1448 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001449
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001450 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001451 {
1452 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001453 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001454 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001455 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1456 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001457 vres = vmaxq_f32(vres, data);
1458 }
1459
1460 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001461 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001462 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001463 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1464 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001465 res = std::max(res, data);
1466 }
1467 }
Gian Marco Iodice16824302017-09-28 15:41:37 +01001468#if defined(__aarch64__)
1469 // Reduction operation available on 64 bit architectures only
1470 res = std::max(vmaxvq_f32(vres), res);
1471#else // __aarch64__
1472 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1473 tmp = vpmax_f32(tmp, tmp);
1474
1475 res = std::max(res, vget_lane_f32(tmp, 0));
1476#endif // __aarch64__
1477 }
1478
1479 // Calculate square-root in case of l2 pooling
1480 if(pooling_type == PoolingType::L2)
1481 {
1482 res = std::sqrt(res);
1483 }
1484
1485 // Store result
1486 *(reinterpret_cast<float *>(output.ptr())) = res;
1487 },
1488 input, output);
1489}
1490
morgolock37722d92020-04-09 14:17:48 +01001491inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y)
1492{
1493 const int pad_left = info.padding().left;
1494 const int pad_right = info.padding().right;
1495 const int pad_top = info.padding().top;
1496 const int pad_bottom = info.padding().bottom;
1497 const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
1498 const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
1499 const int pad_horiz = pad_left + pad_right;
1500 const int pad_vert = pad_top + pad_bottom;
1501
1502 if(info.data_layout() == DataLayout::NCHW)
1503 {
1504 const uint32_t offset_base = padded_offset
1505 - sizeof(float) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
1506 - pad_top * sizeof(float) /* top padding */
1507 - sizeof(float) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
1508 - in_stride_w * id[3];
1509
1510 return offset_base;
1511 }
1512 else
1513 {
1514 const uint32_t offset_base = padded_offset
1515 - sizeof(float) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
1516 - pad_top * sizeof(float) // top padding
1517 - sizeof(float) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
1518 - in_stride_w * id[3];
1519
1520 return offset_base;
1521 }
1522}
1523
morgolockcc1f6c92020-03-24 09:26:48 +00001524void NEPoolingLayerKernel::pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window)
Pablo Tello77e6c552018-12-04 15:33:49 +00001525{
morgolockcc1f6c92020-03-24 09:26:48 +00001526 Iterator input(_input, window_input);
1527 Iterator output(_output, window);
1528 Iterator indices(_indices, window);
1529 int final_index = 0;
1530 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1531 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1532 int pool_stride_x = 0;
1533 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001534 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001535 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1536 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
morgolock37722d92020-04-09 14:17:48 +01001537 const int pad_left = _input->info()->padding().left;
1538 const int pad_right = _input->info()->padding().right;
1539 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
1540 execute_window_loop(window, [&](const Coordinates & id)
Pablo Tello77e6c552018-12-04 15:33:49 +00001541 {
morgolockcc1f6c92020-03-24 09:26:48 +00001542 const auto input_offset_top = input_top_ptr + input.offset();
1543 const auto input_offset_bottom = input_bottom_ptr + input.offset();
1544 const auto in_top_ptr = reinterpret_cast<const float *>(input_offset_top);
1545 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_offset_bottom);
1546 float32x2_t top_data = vld1_f32(in_top_ptr);
1547 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1548 float32x2_t res = {};
1549 float final_res = 0;
1550 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1551 res = vpmax_f32(max_data, max_data);
1552 final_res = vget_lane_f32(res, 0);
Pablo Tello77e6c552018-12-04 15:33:49 +00001553 // Store result
1554 *(reinterpret_cast<float *>(output.ptr())) = final_res;
morgolock37722d92020-04-09 14:17:48 +01001555 const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
1556 const uint32_t offset_top = (uint32_t)(offset_base / sizeof(float));
1557 const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(float) - pad_right - pad_left;
morgolockcc1f6c92020-03-24 09:26:48 +00001558 const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
1559 const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
1560 const uint32x2_t tmp_indices = vbsl_u32(vcgt_f32(top_data, bottom_data), voffset_top, voffset_bottom);
1561 final_index = vget_lane_u32(vbsl_u32(vcgt_f32(max_data, vrev64_f32(max_data)), tmp_indices, vrev64_u32(tmp_indices)), 0);
1562 *(reinterpret_cast<int *>(indices.ptr())) = final_index;
Pablo Tello77e6c552018-12-04 15:33:49 +00001563 },
morgolockcc1f6c92020-03-24 09:26:48 +00001564 input, output, indices);
1565}
1566
1567void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
1568 bool exclude_padding)
1569{
1570 if(pooling_type == PoolingType::MAX && _indices)
1571 {
1572 pooling2_f32_nchw_maxpool_indices(window_input, window);
1573 }
1574 else
1575 {
1576 Iterator input(_input, window_input);
1577 Iterator output(_output, window);
1578 constexpr int pool_size = 2;
1579 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1580 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1581 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1582 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1583 int pool_stride_x = 0;
1584 int pool_stride_y = 0;
1585 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1586 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1587 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1588
1589 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1590 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1591
1592 execute_window_loop(window, [&](const Coordinates & id)
1593 {
1594 const auto in_top_ptr = reinterpret_cast<const float *>(input_top_ptr + input.offset());
1595 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
1596 float32x2_t top_data = vld1_f32(in_top_ptr);
1597 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1598 float32x2_t res = {};
1599 float final_res = 0;
1600 // Get power of 2 in case of l2 pooling
1601 if(pooling_type == PoolingType::L2)
1602 {
1603 top_data = vmul_f32(top_data, top_data);
1604 bottom_data = vmul_f32(bottom_data, bottom_data);
1605 }
1606
1607 if(pooling_type != PoolingType::MAX)
1608 {
1609 // Calculate scale
1610 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1611 const float32x2_t scale_v = vdup_n_f32(scale);
1612
1613 // Perform pooling
1614 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1615 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1616 }
1617 else
1618 {
1619 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1620 res = vpmax_f32(max_data, max_data);
1621 }
1622 final_res = vget_lane_f32(res, 0);
1623
1624 // Calculate square-root in case of l2 pooling
1625 if(pooling_type == PoolingType::L2)
1626 {
1627 final_res = sqrt(final_res);
1628 }
1629
1630 // Store result
1631 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1632 },
1633 input, output);
1634 }
Pablo Tello77e6c552018-12-04 15:33:49 +00001635}
1636
1637void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1638{
1639 Iterator input(_input, window_input);
1640 Iterator output(_output, window);
1641
1642 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001643 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1644 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1645 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1646 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001647 int pool_stride_x = 0;
1648 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001649 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001650 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1651 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1652
1653 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1654 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1655 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1656
1657 execute_window_loop(window, [&](const Coordinates & id)
1658 {
1659 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1660 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1661 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1662 float32x2_t res = {};
1663 float final_res = 0;
1664
1665 // Get power of 2 in case of l2 pooling
1666 if(pooling_type == PoolingType::L2)
1667 {
1668 top_data = vmulq_f32(top_data, top_data);
1669 middle_data = vmulq_f32(middle_data, middle_data);
1670 bottom_data = vmulq_f32(bottom_data, bottom_data);
1671 }
1672
1673 if(pooling_type != PoolingType::MAX)
1674 {
1675 // Calculate scale
1676 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1677 const float32x2_t scale_v = vdup_n_f32(scale);
1678
1679 // Perform pooling
1680 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1681 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1682 res = vmul_f32(vpadd_f32(res, res), scale_v);
1683 }
1684 else
1685 {
1686 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1687 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1688 res = vpmax_f32(res, res);
1689 }
1690 final_res = vget_lane_f32(res, 0);
1691
1692 // Calculate square-root in case of l2 pooling
1693 if(pooling_type == PoolingType::L2)
1694 {
1695 final_res = sqrt(final_res);
1696 }
1697
1698 // Store result
1699 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1700 },
1701 input, output);
1702}
1703
1704void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1705{
1706 Iterator input(_input, window_input);
1707 Iterator output(_output, window);
1708
1709 constexpr const int pool_size = 7;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001710 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1711 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1712 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1713 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001714 int pool_stride_x = 0;
1715 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001716 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001717 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1718 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1719
1720 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1721 for(int i = 0; i < pool_size; ++i)
1722 {
1723 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1724 }
1725
1726 execute_window_loop(window, [&](const Coordinates & id)
1727 {
1728 float32x2_t res = {};
1729 float final_res = 0.f;
1730 if(pooling_type != PoolingType::MAX)
1731 {
1732 // Calculate scale
1733 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1734 const float32x2_t scale_v = vdup_n_f32(scale);
1735
1736 // Perform pooling
1737 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1738 // Get power of 2 in case of l2 pooling
1739 if(pooling_type == PoolingType::L2)
1740 {
1741 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1742 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1743 }
1744 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1745 for(int i = 1; i < pool_size; ++i)
1746 {
1747 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1748 // Get power of 2 in case of l2 pooling
1749 if(pooling_type == PoolingType::L2)
1750 {
1751 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1752 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1753 }
1754 sum_data = vaddq_f32(sum_data, data.val[0]);
1755 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1756 }
1757 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1758 res = vmul_f32(vpadd_f32(res, res), scale_v);
1759 }
1760 else
1761 {
1762 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1763 for(int i = 1; i < pool_size; ++i)
1764 {
1765 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1766 max_data = vmax2q_f32(max_data, data);
1767 }
1768 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1769 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1770 res = vpmax_f32(res, res);
1771 }
1772 final_res = vget_lane_f32(res, 0);
1773
1774 // Calculate square-root in case of l2 pooling
1775 if(pooling_type == PoolingType::L2)
1776 {
1777 final_res = sqrt(final_res);
1778 }
1779
1780 // Store result
1781 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1782 },
1783 input, output);
1784}
1785
1786void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001787{
morgolocke383c352020-04-03 16:57:46 +01001788 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1789 {
1790 pooling2_f32_nhwc_maxpool_indices(window_input, window);
1791 }
1792 else
1793 {
1794 Iterator input(_input, window_input);
1795 Iterator output(_output, window);
1796
1797 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1798 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1799 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1800 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1801 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1802 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1803 int pool_stride_x = 0;
1804 int pool_stride_y = 0;
1805 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1806 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1807 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1808
1809 float32x4_t vres;
1810
1811 execute_window_loop(window, [&](const Coordinates & id)
1812 {
1813 const int idx_width = id.y() * pool_stride_x;
1814 const int idx_height = id.z() * pool_stride_y;
1815 const int pool_limit_y = pool_pad_top - idx_height;
1816 const int pool_limit_x = pool_pad_left - idx_width;
1817
1818 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1819 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1820 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1821 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1822
1823 if(pooling_type != PoolingType::MAX)
1824 {
1825 // Calculate scale
1826 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1827 pool_stride_y);
1828 const float32x4_t scale_v = vdupq_n_f32(scale);
1829
1830 // Perform pooling
1831 vres = vdupq_n_f32(0.0f);
1832
1833 for(int y = pool_start_y; y < pool_end_y; ++y)
1834 {
1835 for(int x = pool_start_x; x < pool_end_x; ++x)
1836 {
1837 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1838 (_input->info()->strides_in_bytes().z())));
1839
1840 // Get power of 2 in case of l2 pooling and accumulate
1841 if(pooling_type == PoolingType::L2)
1842 {
1843 vres = vmlaq_f32(vres, data, data);
1844 }
1845 else
1846 {
1847 vres = vaddq_f32(vres, data);
1848 }
1849 }
1850 }
1851 // Divide by scale
1852 vres = vmulq_f32(vres, scale_v);
1853 }
1854 else
1855 {
1856 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1857 for(int y = pool_start_y; y < pool_end_y; ++y)
1858 {
1859 for(int x = pool_start_x; x < pool_end_x; ++x)
1860 {
1861 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1862 (_input->info()->strides_in_bytes().z())));
1863 vres = vmaxq_f32(vres, data);
1864 }
1865 }
1866 }
1867
1868 // Calculate square-root in case of l2 pooling
1869 if(pooling_type == PoolingType::L2)
1870 {
1871 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1872 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
1873 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
1874 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
1875 };
1876 vres = l2_res;
1877 }
1878
1879 // Store result
1880 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
1881 },
1882 input, output);
1883 }
1884}
1885
1886void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
1887{
Michalis Spyrou57dac842018-03-01 16:03:50 +00001888 Iterator input(_input, window_input);
1889 Iterator output(_output, window);
morgolocke383c352020-04-03 16:57:46 +01001890 Iterator indices(_indices, window);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001891
morgolocke383c352020-04-03 16:57:46 +01001892 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1893 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1894
1895 int pool_stride_x = 0;
1896 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001897 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001898
1899 float32x4_t vres;
1900
morgolocke383c352020-04-03 16:57:46 +01001901 const int pad_right = _input->info()->padding().right;
morgolocke383c352020-04-03 16:57:46 +01001902 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
1903 const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
morgolocke383c352020-04-03 16:57:46 +01001904
Michalis Spyrou57dac842018-03-01 16:03:50 +00001905 execute_window_loop(window, [&](const Coordinates & id)
1906 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001907 const int idx_width = id.y() * pool_stride_x;
1908 const int idx_height = id.z() * pool_stride_y;
1909 const int pool_limit_y = pool_pad_top - idx_height;
1910 const int pool_limit_x = pool_pad_left - idx_width;
1911
1912 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
Michalis Spyrouced25572018-10-01 16:26:20 +01001913 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
morgolocke383c352020-04-03 16:57:46 +01001914 const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
1915 (_input->info()->strides_in_bytes().z());
1916 const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
1917 (_input->info()->strides_in_bytes().z());
Michalis Spyrouced25572018-10-01 16:26:20 +01001918
morgolocke383c352020-04-03 16:57:46 +01001919 const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
1920 (_input->info()->strides_in_bytes().z());
Michalis Spyrou57dac842018-03-01 16:03:50 +00001921
morgolocke383c352020-04-03 16:57:46 +01001922 const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
1923 (_input->info()->strides_in_bytes().z());
Michalis Spyrou57dac842018-03-01 16:03:50 +00001924
morgolocke383c352020-04-03 16:57:46 +01001925 const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
1926 const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
1927 const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
1928 const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
1929 const auto v_x0 = vld1q_f32(in_x0_ptr);
1930 const auto v_x1 = vld1q_f32(in_x1_ptr);
1931 const auto v_x2 = vld1q_f32(in_x2_ptr);
1932 const auto v_x3 = vld1q_f32(in_x3_ptr);
1933 vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001934 // Store result
1935 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
morgolocke383c352020-04-03 16:57:46 +01001936
morgolock37722d92020-04-09 14:17:48 +01001937 const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
1938 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float);
1939 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
1940 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
1941 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
morgolocke383c352020-04-03 16:57:46 +01001942 const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
1943 const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
1944 const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
1945 const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
1946 const uint32x4_t tmp_indices0 = vbslq_u32(vcgtq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
1947 const uint32x4_t tmp_indices1 = vbslq_u32(vcgtq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
1948 const uint32x4_t tmp_indices2 = vbslq_u32(vcgtq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
1949
1950 vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()), tmp_indices2);
1951
Michalis Spyrou57dac842018-03-01 16:03:50 +00001952 },
morgolocke383c352020-04-03 16:57:46 +01001953 input, output, indices);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001954}
1955
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001956template <typename T>
1957void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00001958{
1959 Iterator input(_input, window_input);
1960 Iterator output(_output, window);
1961
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001962 /** NEON vector types */
1963 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1964 using q16_t = typename wrapper::traits::promote_t<T>;
1965 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1966 using q32_t = typename wrapper::traits::promote_t<q16_t>;
1967 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
1968
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001969 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1970 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1971 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1972 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1973 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1974 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001975 int pool_stride_x = 0;
1976 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001977 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001978 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1979 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001980
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001981 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1982 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
1983
Georgios Pinitas55186712018-01-08 17:37:12 +00001984 execute_window_loop(window, [&](const Coordinates & id)
1985 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001986 T res = std::numeric_limits<T>::min();
Georgios Pinitas55186712018-01-08 17:37:12 +00001987
1988 if(pooling_type != PoolingType::MAX)
1989 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001990 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1991 q32_t sres = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001992
1993 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001994 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001995
1996 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001997 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001998 {
1999 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002000 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002001 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002002 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2003 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00002004
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002005 const q16x8_t data_q16 = wrapper::vmovl(data);
2006 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
Georgios Pinitas55186712018-01-08 17:37:12 +00002007 }
2008
2009 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002010 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002011 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002012 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2013 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00002014 sres += data;
2015 }
2016 }
2017
2018 // Reduction
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002019 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
2020 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
Georgios Pinitas55186712018-01-08 17:37:12 +00002021
2022 // Divide by scale
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002023 res = static_cast<T>(support::cpp11::round(sres * scale));
Georgios Pinitas55186712018-01-08 17:37:12 +00002024 }
2025 else
2026 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002027 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
Georgios Pinitas55186712018-01-08 17:37:12 +00002028
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002029 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002030 {
2031 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002032 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002033 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002034 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2035 (_input->info()->strides_in_bytes().y())));
2036 vres = wrapper::vmax(vres, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002037 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002038 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002039 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002040 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002041 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2042 (_input->info()->strides_in_bytes().y())));
2043 res = std::max(res, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002044 }
2045 }
2046
2047 // Reduce max
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002048 vres = wrapper::vpmax(vres, vres);
2049 vres = wrapper::vpmax(vres, vres);
2050 vres = wrapper::vpmax(vres, vres);
Georgios Pinitas55186712018-01-08 17:37:12 +00002051
2052 // Get max value
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002053 res = std::max(res, wrapper::vgetlane(vres, 0));
Georgios Pinitas55186712018-01-08 17:37:12 +00002054 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002055 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002056 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
2057 *(reinterpret_cast<T *>(output.ptr())) = res;
Georgios Pinitas55186712018-01-08 17:37:12 +00002058 },
2059 input, output);
2060}
2061
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002062template <typename T>
2063void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002064{
2065 Iterator input(_input, window_input);
2066 Iterator output(_output, window);
2067
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002068 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2069 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
2070 using q16_t = typename wrapper::traits::promote_t<T>;
2071 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2072 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2073 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2074
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002075 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
2076 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
2077 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2078 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2079 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2080 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002081
2082 int pool_stride_x = 0;
2083 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002084 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002085 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
2086 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
2087
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01002088 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
2089 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
2090 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitas283fc602018-11-09 10:46:43 +00002091
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002092 const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002093 // "new_offset" doesn't have to consider the "half_scale_v" in its computation
2094 // With a requantization performed in a single step there won't be uncertainties introduced
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002095 const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002096
2097 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
2098 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
2099 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
2100
Michalis Spyrou57dac842018-03-01 16:03:50 +00002101 execute_window_loop(window, [&](const Coordinates & id)
2102 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002103 const int idx_width = id.y() * pool_stride_x;
2104 const int idx_height = id.z() * pool_stride_y;
2105 const int pool_limit_y = pool_pad_top - idx_height;
2106 const int pool_limit_x = pool_pad_left - idx_width;
2107
2108 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2109 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
2110 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2111 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
2112
Michalis Spyrou57dac842018-03-01 16:03:50 +00002113 if(pooling_type != PoolingType::MAX)
2114 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002115 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2116 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2117 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2118 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002119
2120 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00002121 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2122 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002123
2124 // Perform pooling
Michalis Spyrouced25572018-10-01 16:26:20 +01002125 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002126 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002127 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002128 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002129 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2130 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002131
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002132 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
2133 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
2134 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
2135 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
2136 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
2137 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002138 }
2139 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002140
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002141 if(input_qinfo != output_qinfo)
2142 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002143 const float32x4x4_t vres =
2144 {
2145 {
2146 vcvtq_f32_q32(vres1),
2147 vcvtq_f32_q32(vres2),
2148 vcvtq_f32_q32(vres3),
2149 vcvtq_f32_q32(vres4),
2150 }
2151 };
2152 const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
2153 // Store result
2154 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), wrapper::vgetlow(requantized_output));
2155 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, wrapper::vgethigh(requantized_output));
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002156 }
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002157 else
2158 {
2159 const float32x4_t scale_v = vdupq_n_f32(scale);
2160 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2161 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
2162 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
2163 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
2164 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002165
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002166 const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
2167 const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
2168 // Store result
2169 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res1);
2170 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, res2);
2171 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002172 }
2173 else
2174 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002175 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002176
Michalis Spyrouced25572018-10-01 16:26:20 +01002177 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002178 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002179 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002180 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002181 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2182 (_input->info()->strides_in_bytes().z())));
2183 vres = wrapper::vmax(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002184 }
2185 }
2186
2187 // Store result
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002188 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002189 }
2190 },
2191 input, output);
2192}
2193
morgolockcc1f6c92020-03-24 09:26:48 +00002194Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002195{
2196 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2197
2198 unsigned int pooled_w = 0;
2199 unsigned int pooled_h = 0;
2200 unsigned int num_elems_processed_per_iteration = 0;
2201 BorderSize border_size(0);
2202
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002203 const bool is_global_pooling = pool_info.is_global_pooling;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002204 unsigned int pool_size_x = 0;
2205 unsigned int pool_size_y = 0;
2206
2207 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +00002208 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2209 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2210 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002211
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002212 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2213 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002214
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002215 // Validate pool info before calling scaled_dimensions
2216 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002217
2218 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00002219 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2220 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002221 pool_size_x,
2222 pool_size_y,
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002223 pool_info.pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002224
morgolockcc1f6c92020-03-24 09:26:48 +00002225 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
2226 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
2227 (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002228 pool_size_x, pool_size_y)
2229 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002230
2231 return Status{};
2232}
2233
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002234void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002235{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002236 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002237 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2238 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2239 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2240
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002241 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
2242 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
2243 const unsigned int pool_size = _pool_info.pool_size.width;
2244 const bool exclude_padding = _pool_info.exclude_padding;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002245
Michalis Spyrou57dac842018-03-01 16:03:50 +00002246 Window window_input(window);
Georgios Pinitas14d9d982019-12-13 12:33:09 +00002247 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002248 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002249 // Set step for input in x and y direction for the input
2250 unsigned int window_x_inc = 0;
2251 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01002252 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002253 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002254 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002255 {
2256 window_x_inc = pool_stride_x;
2257 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2258 {
2259 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2260 }
2261 break;
2262 }
Pablo Tello77e6c552018-12-04 15:33:49 +00002263
Georgios Pinitas13d96e02018-08-23 11:20:23 +01002264 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002265 case DataType::F32:
2266 {
2267 window_x_inc = pool_stride_x;
2268 break;
2269 }
2270 default:
2271 {
2272 ARM_COMPUTE_ERROR("Not supported");
2273 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002274 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002275 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2276 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002277 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002278 else
2279 {
Georgios Pinitascac13b12018-04-27 19:07:19 +01002280 window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002281 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2282 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2283 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002284
2285 // Run function
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002286 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002287}
morgolockcc1f6c92020-03-24 09:26:48 +00002288} // namespace arm_compute