blob: 2bbc307d191ad0b48e328cd931dfd6042da09b7f [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010038#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000039#include "support/ToolchainSupport.h"
40
Manuel Bottinib4bb8272019-12-18 18:01:27 +000041#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042#include <algorithm>
43#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010044#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010045#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010046#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010047#include <string>
48#include <tuple>
49
Manuel Bottinib4bb8272019-12-18 18:01:27 +000050namespace arm_compute
51{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010052using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010053
54namespace
55{
Pablo Tello77e6c552018-12-04 15:33:49 +000056inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010057 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
58{
Michalis Spyrou57dac842018-03-01 16:03:50 +000059 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
60 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
61
62 int start_x = id[idx_width] * stride_x - pad_x;
63 int start_y = id[idx_height] * stride_y - pad_y;
64
65 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
66 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000067 if(exclude_padding)
68 {
69 start_x = std::max(0, start_x);
70 start_y = std::max(0, start_y);
71 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010072 return 1.f / ((end_y - start_y) * (end_x - start_x));
73}
74
Manuel Bottinib4bb8272019-12-18 18:01:27 +000075template <typename T, typename TVec>
76inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
Georgios Pinitas55186712018-01-08 17:37:12 +000077 const int pool_size, const int upper_bound_w, const int upper_bound_h,
78 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
79{
80 int start_x = (id.x() + id_offset) * stride_x - pad_x;
81 int start_y = id.y() * stride_y - pad_y;
82 const int end_y = std::min(start_y + pool_size, upper_bound_h);
83 if(exclude_padding)
84 {
85 start_y = std::max(0, start_y);
86 }
87
Manuel Bottinib4bb8272019-12-18 18:01:27 +000088 std::array<T, 8> elems =
Georgios Pinitas55186712018-01-08 17:37:12 +000089 {
90 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +000091 wrapper::vgetlane(v, 0),
92 wrapper::vgetlane(v, 1),
93 wrapper::vgetlane(v, 2),
94 wrapper::vgetlane(v, 3),
95 wrapper::vgetlane(v, 4),
96 wrapper::vgetlane(v, 5),
97 wrapper::vgetlane(v, 6),
98 wrapper::vgetlane(v, 7),
Georgios Pinitas55186712018-01-08 17:37:12 +000099 }
100 };
101
102 for(auto &el : elems)
103 {
104 int c_start_x = start_x;
105 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
106 if(exclude_padding)
107 {
108 c_start_x = std::max(0, c_start_x);
109 }
110 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
111 el *= scale;
112 start_x += step * stride_x;
113 }
114
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000115 v = wrapper::vsetlane(elems[0], v, 0);
116 v = wrapper::vsetlane(elems[1], v, 1);
117 v = wrapper::vsetlane(elems[2], v, 2);
118 v = wrapper::vsetlane(elems[3], v, 3);
119 v = wrapper::vsetlane(elems[4], v, 4);
120 v = wrapper::vsetlane(elems[5], v, 5);
121 v = wrapper::vsetlane(elems[6], v, 6);
122 v = wrapper::vsetlane(elems[7], v, 7);
Georgios Pinitas55186712018-01-08 17:37:12 +0000123}
124
morgolockcc1f6c92020-03-24 09:26:48 +0000125Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
126 unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100127{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000128 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100129
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000130 int pool_stride_x = 0;
131 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000132 PoolingType pool_type = pool_info.pool_type;
133 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100135
Anthony Barbiereaefd002018-07-20 17:49:35 +0100136 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
morgolockcc1f6c92020-03-24 09:26:48 +0000137 if(indices)
138 {
morgolock37722d92020-04-09 14:17:48 +0100139 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
morgolockcc1f6c92020-03-24 09:26:48 +0000140 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
141 ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
142 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000143 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000144 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Michele Di Giorgio2c877192020-02-18 19:06:27 +0000145 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
146 && (input->data_layout() == DataLayout::NHWC),
147 "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000148
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000149 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100150 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000151 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000152 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
153 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
154 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
morgolockcc1f6c92020-03-24 09:26:48 +0000155
156 if(indices)
157 {
158 ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
morgolockcc1f6c92020-03-24 09:26:48 +0000159 ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
160 || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
161 }
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100162 }
163
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000164 return Status{};
165}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100166
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000167Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000168{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000169 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
170 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000171
172 return Status{};
173}
174
/** Auto-initialize the output/indices infos and build the execution window for the pooling kernel.
 *
 * @param[in, out] input                             Source tensor info. Wrapped in an access window that is passed to update_window_and_padding.
 * @param[in, out] output                            Destination tensor info. Auto-initialized from @p input with the pooled shape if empty.
 * @param[in, out] indices                           Optional indices tensor info (may be nullptr). Auto-initialized as U32 with the pooled shape if empty.
 * @param[in]      pool_info                         Pooling descriptor (data layout, padding and stride information).
 * @param[out]     num_elems_processed_per_iteration Number of elements processed per window step, chosen from data type, layout and pool size.
 * @param[out]     border_size                       Border required by the kernel. Only written in the NCHW branch.
 * @param[in]      pooled_w                          Overwritten with the result of scaled_dimensions before being read; the value passed in is not used.
 * @param[in]      pooled_h                          Overwritten with the result of scaled_dimensions before being read; the value passed in is not used.
 * @param[in]      pool_size_x                       Pooling window width.
 * @param[in]      pool_size_y                       Pooling window height.
 *
 * @return A pair of (Status, Window). The Status is a RUNTIME_ERROR ("Insufficient Padding!") when
 *         update_window_and_padding returns true, and an empty Status otherwise.
 */
morgolockcc1f6c92020-03-24 09:26:48 +0000175std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
176 unsigned int &num_elems_processed_per_iteration,
177 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000178 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000179{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100180 // Output auto initialization if not yet initialized
181 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
morgolockcc1f6c92020-03-24 09:26:48 +0000182 if(indices)
183 {
184 // Indices auto initialization if not yet initialized
morgolocke383c352020-04-03 16:57:46 +0100185 auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
186 pool_info)))
187 .set_data_type(DataType::U32) /* we store the offset to the element */);
morgolockcc1f6c92020-03-24 09:26:48 +0000188 }
// A data layout set in pool_info takes precedence; UNKNOWN falls back to the input's own layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000189 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000190 unsigned int num_elems_read_per_iteration = 0;
191 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000192 int pool_stride_x = 0;
193 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000194 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
195 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
196 const int input_width = input->dimension(idx_width);
197 const int input_height = input->dimension(idx_height);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000198 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000199 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000200 const int pool_pad_right = pad_stride_info.pad_right();
201 const int pool_pad_top = pad_stride_info.pad_top();
202 const int pool_pad_left = pad_stride_info.pad_left();
203 const int pool_pad_bottom = pad_stride_info.pad_bottom();
204 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000205
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000206 // Compute the pooled output dimensions; this overwrites the pooled_w/pooled_h arguments
Michalis Spyrou57dac842018-03-01 16:03:50 +0000207 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
208 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000209 pool_size_x,
210 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000211 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100212
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000213 // Defaults, kept when none of the optimized square-pool cases below applies (the MxN path)
214 num_elems_read_per_iteration = 1;
215 num_elems_processed_per_iteration = 1;
216 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100217
Michalis Spyrou57dac842018-03-01 16:03:50 +0000218 const bool is_nhwc = data_layout == DataLayout::NHWC;
219
// Square pools: choose element counts per data type and pool size.
// For NHWC only num_elems_processed_per_iteration is changed and the inner pool-size switch is skipped.
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000220 if(is_square)
221 {
222 switch(input->data_type())
223 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000224 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000225 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000226 if(is_nhwc)
227 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100228 num_elems_processed_per_iteration = 16;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000229 break;
230 }
// NCHW: 16 elements are read per iteration; the processed/written counts depend on whether stride_x is 2
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000231 switch(pool_size_x)
232 {
233 case 2:
234 num_elems_read_per_iteration = 16;
235 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
236 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
237 break;
238 case 3:
239 num_elems_read_per_iteration = 16;
240 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
241 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
242 break;
243 default:
244 break;
245 }
246 break;
// The F16 case only exists when FP16 vector arithmetic is available; otherwise F16 falls into the default (error) case
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000247#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
248 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000249 if(is_nhwc)
250 {
251 num_elems_processed_per_iteration = 8;
252 break;
253 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000254 switch(pool_size_x)
255 {
256 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000257 case 3:
258 num_elems_read_per_iteration = 4;
259 num_elems_processed_per_iteration = 1;
260 num_elems_horizontal_window = 1;
261 break;
262 default:
263 break;
264 }
265 break;
266#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
267 case DataType::F32:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000268 if(is_nhwc)
269 {
Georgios Pinitas64f1a902018-09-18 13:42:51 +0100270 num_elems_processed_per_iteration = 4;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000271 break;
272 }
// NCHW: only the read width depends on the pool size; one output element is produced per iteration
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000273 switch(pool_size_x)
274 {
275 case 2:
276 num_elems_read_per_iteration = 2;
277 break;
278 case 3:
279 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
280 break;
281 case 7:
282 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
283 break;
284 default:
285 break;
286 }
287 num_elems_processed_per_iteration = 1;
288 num_elems_horizontal_window = 1;
289 break;
290 default:
291 ARM_COMPUTE_ERROR("Element size not supported");
292 break;
293 }
294 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000295 else
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000296 {
// Non-square pools: only NHWC departs from the defaults, using 16 / element_size() elements per step
// (presumably one 16-byte vector, assuming element_size() is in bytes - TODO confirm)
Michalis Spyrou57dac842018-03-01 16:03:50 +0000297 if(is_nhwc)
298 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100299 num_elems_processed_per_iteration = 16 / input->element_size();
Michalis Spyrou57dac842018-03-01 16:03:50 +0000300 }
301 }
302
303 bool window_changed = false;
304 Window win{};
305 if(data_layout == DataLayout::NCHW)
306 {
307 // Number of iterations in X dimension
308 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000309 // Upper limit for the number of right/bottom border elements that are accessed
310 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
311 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
// Start from the configured padding, then widen right/bottom when the computed upper bounds exceed it
morgolockcc1f6c92020-03-24 09:26:48 +0000312 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
313 border_size.right = std::max(upper_bound_w, pool_pad_right);
314 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
// The window is computed on a copy of the input info whose dimensions 0 and 1 are set to the pooled width/height
Michalis Spyrou57dac842018-03-01 16:03:50 +0000315 TensorShape output_shape{ input->tensor_shape() };
316 output_shape.set(0, pooled_w);
317 output_shape.set(1, pooled_h);
318 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000319 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
// Input access spans from (-pad_left, -pad_top) to the input extent plus the right/bottom border
morgolockcc1f6c92020-03-24 09:26:48 +0000320 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000321 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
morgolockcc1f6c92020-03-24 09:26:48 +0000322 if(indices)
323 {
324 AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
325 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
326 }
327 else
328 {
329 window_changed = update_window_and_padding(win, input_access, output_access);
330 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000331 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
332 }
333 else
334 {
// Any other layout: pooled width/height are written to dimensions 1 and 2, and border_size is left untouched
Michalis Spyrou57dac842018-03-01 16:03:50 +0000335 TensorShape output_shape{ input->tensor_shape() };
336 output_shape.set(1, pooled_w);
337 output_shape.set(2, pooled_h);
338 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000339 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
340 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000341 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
morgolockcc1f6c92020-03-24 09:26:48 +0000342 if(indices)
343 {
344 AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
345 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
346 }
347 else
348 {
349 window_changed = update_window_and_padding(win, input_access, output_access);
350 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000351 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000352 }
353
// A true result from update_window_and_padding is reported to the caller as an insufficient-padding error
354 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
355 return std::make_pair(err, win);
356}
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000357
358template <typename T>
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000359inline T vcvtq_q32_f32(float32x4_t values);
360
361template <>
362inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
363{
364 return vcvtq_u32_f32(values);
365}
366
367template <>
368inline int32x4_t vcvtq_q32_f32(float32x4_t values)
369{
370 return vcvtq_s32_f32(values);
371}
372
373template <typename T>
374inline float32x4_t vcvtq_f32_q32(T values);
375
376template <>
377inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
378{
379 return vcvtq_f32_u32(values);
380}
381
382template <>
383inline float32x4_t vcvtq_f32_q32(int32x4_t values)
384{
385 return vcvtq_f32_s32(values);
386}
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000387
388template <typename Tout>
389inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
390
391template <>
392inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
393{
394 const float new_scale = quant_rescale / scale_pooling;
395 return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
396}
397
398template <>
399inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
400{
401 const float new_scale = quant_rescale / scale_pooling;
402 return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
403}
404
405template <typename Tin, typename Tout>
406inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
407
408template <>
409inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
410{
411 const float32x4x4_t acc =
412 {
413 {
414 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
415 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
416 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
417 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
418 }
419 };
420 return vquantize(acc, requant_qinfo);
421}
422
423template <>
424inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
425{
426 const float32x4x4_t acc =
427 {
428 {
429 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
430 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
431 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
432 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
433 }
434 };
435 return vquantize_signed(acc, requant_qinfo);
436}
437
438template <typename T>
439inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
440
441template <>
442inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
443{
444 const float32x4x2_t acc =
445 {
446 {
447 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
448 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
449 }
450 };
451 return vquantize(acc, requant_qinfo);
452}
453
454template <>
455inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
456{
457 const float32x4x2_t acc =
458 {
459 {
460 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
461 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
462 }
463 };
464 return vquantize_signed(acc, requant_qinfo);
465}
466
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000467} // namespace
468
469NEPoolingLayerKernel::NEPoolingLayerKernel()
morgolockcc1f6c92020-03-24 09:26:48 +0000470 : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000471{
472}
473
474BorderSize NEPoolingLayerKernel::border_size() const
475{
476 return _border_size;
477}
478
morgolockcc1f6c92020-03-24 09:26:48 +0000479void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000480{
481 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000482 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
483 const bool is_global_pooling = pool_info.is_global_pooling;
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000484 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000485
486 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000487 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
488 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
489 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000490
491 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000492 const Size2D pool_size(
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000493 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
494 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000495
496 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000497 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000498
499 // Check output dimensions
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100500 unsigned int pooled_w;
501 unsigned int pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000502 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
503 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000504 pool_size.x(),
505 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000506 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000507
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000508 // Perform validation step
morgolockcc1f6c92020-03-24 09:26:48 +0000509 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100510
511 // Set instance variables
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000512 _input = input;
513 _output = output;
morgolockcc1f6c92020-03-24 09:26:48 +0000514 _indices = indices;
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000515 _pool_info = pool_info;
516 _data_layout = input->info()->data_layout();
517 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100518
Georgios Pinitas55186712018-01-08 17:37:12 +0000519 // Get data type
520 const DataType data_type = input->info()->data_type();
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000521 const bool is_nchw = _data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000522
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100523 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000524 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000525 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000526 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000527 if(is_nchw)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100528 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000529 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000530 }
531 else
532 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000533 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000534 }
535 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000536 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000537 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000538 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000539 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000540 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000541 }
542 else
543 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000544 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000545 }
546 }
547 else
548 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000549 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000550 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000551 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000552 }
553 else
554 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000555 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
556 }
557 }
558 }
559 else if(data_type == DataType::QASYMM8_SIGNED)
560 {
561 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
562 {
563 if(is_nchw)
564 {
565 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
566 }
567 else
568 {
569 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
570 }
571 }
572 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
573 {
574 if(is_nchw)
575 {
576 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
577 }
578 else
579 {
580 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
581 }
582 }
583 else
584 {
585 if(is_nchw)
586 {
587 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
588 }
589 else
590 {
591 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000592 }
593 }
594 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000595 else if(data_type == DataType::F16)
596 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000597 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000598 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000599 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000600 {
601 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000602 {
603 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000604 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000605 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000606 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000607 else
608 {
609 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
610 }
611 }
612 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000613 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000614 {
615 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000616 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000617 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000618 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000619 else
620 {
621 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
622 }
623 }
624 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000625 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000626 {
627 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000628 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000629 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
630 }
631 else
632 {
633 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000634 }
635 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000636 }
637 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000638 }
639 }
640 else
641 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000642 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000643 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000644 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
645 }
646 else
647 {
648 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000649 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000650 }
651 }
652 else if(data_type == DataType::F32)
653 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000654 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000655 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000656 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000657 {
658 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000659 {
660 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000661 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000662 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
663 }
664 else
665 {
666 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000667 }
668 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000669 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000670 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000671 {
672 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000673 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000674 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
675 }
676 else
677 {
678 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000679 }
680 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000681 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000682 case 7:
Pablo Tello77e6c552018-12-04 15:33:49 +0000683 {
684 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000685 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000686 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
687 }
688 else
689 {
690 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000691 }
692 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000693 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000694 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000695 {
696 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000697 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000698 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
699 }
700 else
701 {
702 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000703 }
704 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000705 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000706 }
707 }
708 else
709 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000710 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000711 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000712 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
713 }
714 else
715 {
716 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000717 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000718 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100719 }
720
721 // Configure kernel window
morgolockcc1f6c92020-03-24 09:26:48 +0000722 auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
723 pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000724 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
725 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100726}
727
Sheri Zhange0681992020-07-14 15:29:28 +0100728template <typename T = float>
729inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y)
730{
731 const int pad_left = info.padding().left;
732 const int pad_right = info.padding().right;
733 const int pad_top = info.padding().top;
734 const int pad_bottom = info.padding().bottom;
735 const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
736 const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
737 const int pad_horiz = pad_left + pad_right;
738 const int pad_vert = pad_top + pad_bottom;
739
740 if(info.data_layout() == DataLayout::NCHW)
741 {
742 const uint32_t offset_base = padded_offset
743 - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
744 - pad_top * sizeof(T) /* top padding */
745 - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
746 - in_stride_w * id[3];
747
748 return offset_base;
749 }
750 else
751 {
752 const uint32_t offset_base = padded_offset
753 - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
754 - pad_top * sizeof(T) // top padding
755 - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
756 - in_stride_w * id[3];
757
758 return offset_base;
759 }
760}
761
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000762template <typename T>
763void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000764{
765 Iterator input(_input, window_input);
766 Iterator output(_output, window);
767
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000768 /** NEON vector types */
769 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
770 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
771 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
772 using q16_t = typename wrapper::traits::promote_t<T>;
773 using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
774 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
775 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
776
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000777 constexpr int pool_size = 2;
778 int pool_stride_x = 0;
779 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000780 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
781 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
782 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
783 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
784 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000785 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
786 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000787
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000788 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
789 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000790
791 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
792
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100793 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
794 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
795 const bool have_different_qinfo = input_qinfo != output_qinfo;
796
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000797 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
798 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
799 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
800
Georgios Pinitas55186712018-01-08 17:37:12 +0000801 execute_window_loop(window, [&](const Coordinates & id)
802 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000803 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
804 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
805 q8x8_t lower_res = {};
806 q8x8_t upper_res = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000807
808 if(pooling_type != PoolingType::MAX)
809 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000810 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
811 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000812
813 // Add rows
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000814 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000815 {
816 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000817 wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
818 wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000819 }
820 };
821
822 // Pair-wise add row data
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000823 const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
824 const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +0000825
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000826 q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
Georgios Pinitas55186712018-01-08 17:37:12 +0000827
828 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000829 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
830 pool_size, upper_bound_w, upper_bound_h,
831 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
832 lower_res = wrapper::vmovn(res_lower);
Georgios Pinitas55186712018-01-08 17:37:12 +0000833
834 // Compute upper result for stride_x == 1
835 if(pool_stride_x == 1)
836 {
837 // Shifted row sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000838 const q16x8x2_t vrsum_shifted =
Georgios Pinitas55186712018-01-08 17:37:12 +0000839 {
840 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000841 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
842 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000843 }
844 };
845
846 // Pair-wise add shifted row
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000847 q16x8_t res_upper = wrapper::vcombine(
848 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
849 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
Georgios Pinitas55186712018-01-08 17:37:12 +0000850
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000851 // Scale upper result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000852 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
853 pool_size, upper_bound_w, upper_bound_h,
854 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
855 upper_res = wrapper::vmovn(res_upper);
Georgios Pinitas55186712018-01-08 17:37:12 +0000856 }
857 }
858 else
859 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000860 const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
861 lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
Georgios Pinitas55186712018-01-08 17:37:12 +0000862 if(pool_stride_x == 1)
863 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000864 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
865 upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
Georgios Pinitas55186712018-01-08 17:37:12 +0000866 }
867 }
868
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100869 if(have_different_qinfo)
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100870 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000871 const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000872 lower_res = wrapper::vgetlow(requantized_output);
873 upper_res = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100874 }
875
Georgios Pinitas55186712018-01-08 17:37:12 +0000876 // Store result
877 if(pool_stride_x == 1)
878 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000879 const q8x8x2_t res = { { lower_res, upper_res } };
880 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000881 }
882 else
883 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000884 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000885 }
886 },
887 input, output);
888}
889
Pablo Tello77e6c552018-12-04 15:33:49 +0000890void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100891{
Pablo Tello77e6c552018-12-04 15:33:49 +0000892 ARM_COMPUTE_UNUSED(pooling_type);
893 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000894#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100895 Iterator input(_input, window_input);
896 Iterator output(_output, window);
897
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000898 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000899 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
900 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
901 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
902 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000903 int pool_stride_x = 0;
904 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000905 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000906 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
907 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100908
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000909 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
910 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
911 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100912
913 execute_window_loop(window, [&](const Coordinates & id)
914 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100915 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
916 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
917 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
918 float16x4_t res = {};
919
920 // Get power of 2 in case of l2 pooling
921 if(pooling_type == PoolingType::L2)
922 {
923 top_data = vmul_f16(top_data, top_data);
924 middle_data = vmul_f16(middle_data, middle_data);
925 bottom_data = vmul_f16(bottom_data, bottom_data);
926 }
927
928 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100929 {
930 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000931 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100932 const float16x4_t scale_v = vdup_n_f16(scale);
933 // Perform pooling
934 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
935 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
936 res = vmul_f16(vpadd_f16(res, res), scale_v);
937 }
938 else
939 {
940 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
941 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
942 res = vpmax_f16(res, res);
943 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100944
945 // Calculate square-root in case of l2 pooling
946 if(pooling_type == PoolingType::L2)
947 {
948 res = vinv_f16(vinvsqrt_f16(res));
949 }
950
Pablo Tello0c34fe22017-06-26 17:17:42 +0100951 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
952 },
953 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000954#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100955 ARM_COMPUTE_UNUSED(window_input);
956 ARM_COMPUTE_UNUSED(window);
957 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000958#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100959}
960
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Widen the first two lanes of a half-precision vector to single precision.
 *
 * Selected only when @p T is float16_t; lanes 2 and 3 of @p input are ignored.
 *
 * @tparam T Element type used for overload selection.
 *
 * @param[in] input Vector of four half-precision values.
 *
 * @return Vector holding lanes 0 and 1 of @p input converted to float.
 */
template <typename T>
inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
f16_to_f32(float16x4_t input)
{
    const float       first_lane  = static_cast<float>(vget_lane_f16(input, 0));
    const float       second_lane = static_cast<float>(vget_lane_f16(input, 1));
    const float32x2_t widened     = { first_lane, second_lane };
    return widened;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
970
971template <typename T>
972inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
973f16_to_f32(float32x2_t input)
974{
975 return input;
976}
977
978template <typename T = float>
979void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window)
980{
981 Iterator input(_input, window_input);
982 Iterator output(_output, window);
983 Iterator indices(_indices, window);
984 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
985 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
986 int pool_stride_x = 0;
987 int pool_stride_y = 0;
988 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
989 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
990 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
991 const int pad_left = _input->info()->padding().left;
992 const int pad_right = _input->info()->padding().right;
993 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
994
995 execute_window_loop(window, [&](const Coordinates & id)
996 {
997 auto top_data = wrapper::vload(reinterpret_cast<const T *>(input_top_ptr + input.offset()));
998 auto bottom_data = wrapper::vload(reinterpret_cast<const T *>(input_bottom_ptr + input.offset()));
999 float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
1000 float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
1001
1002 // Calculate max data, compare top first, then bottom, to make sue the first max is recorded.
1003 const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
1004 const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
1005 const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
1006 *(reinterpret_cast<T *>(output.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
1007
1008 // Calculate max data indice, which will be used in max unpool.
1009 const uint32_t offset_base = offset_no_padding<T>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
1010 const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
1011 const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
1012 const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
1013 const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
1014 const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
1015 const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
1016 *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
1017 },
1018 input, output, indices);
1019}
1020
Pablo Tello77e6c552018-12-04 15:33:49 +00001021void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +01001022{
Pablo Tello77e6c552018-12-04 15:33:49 +00001023 ARM_COMPUTE_UNUSED(pooling_type);
1024 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001025#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhange0681992020-07-14 15:29:28 +01001026 if(pooling_type == PoolingType::MAX && _indices)
Pablo Tello0c34fe22017-06-26 17:17:42 +01001027 {
Sheri Zhange0681992020-07-14 15:29:28 +01001028 pooling2_nchw_maxpool_indices<float16_t>(window_input, window);
1029 }
1030 else
1031 {
1032 Iterator input(_input, window_input);
1033 Iterator output(_output, window);
1034 constexpr int pool_size = 2;
1035 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1036 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1037 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1038 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1039 int pool_stride_x, pool_stride_y = 0;
1040 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1041 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1042 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +01001043
Sheri Zhange0681992020-07-14 15:29:28 +01001044 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1045 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1046
1047 execute_window_loop(window, [&](const Coordinates & id)
Georgios Pinitascdf51452017-08-31 14:21:36 +01001048 {
Sheri Zhange0681992020-07-14 15:29:28 +01001049 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
1050 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
1051 float16x4_t res = {};
Georgios Pinitascdf51452017-08-31 14:21:36 +01001052
Sheri Zhange0681992020-07-14 15:29:28 +01001053 // Get power of 2 in case of l2 pooling
1054 if(pooling_type == PoolingType::L2)
1055 {
1056 top_data = vmul_f16(top_data, top_data);
1057 bottom_data = vmul_f16(bottom_data, bottom_data);
1058 }
Georgios Pinitas13d96e02018-08-23 11:20:23 +01001059
Sheri Zhange0681992020-07-14 15:29:28 +01001060 if(pooling_type != PoolingType::MAX)
1061 {
1062 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1063 const float16x4_t scale_v = vdup_n_f16(scale);
Georgios Pinitascdf51452017-08-31 14:21:36 +01001064
Sheri Zhange0681992020-07-14 15:29:28 +01001065 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
1066 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
1067 }
1068 else
1069 {
1070 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
1071 res = vpmax_f16(max_data, max_data);
1072 }
Georgios Pinitascdf51452017-08-31 14:21:36 +01001073
Sheri Zhange0681992020-07-14 15:29:28 +01001074 // Calculate square-root in case of l2 pooling
1075 if(pooling_type == PoolingType::L2)
1076 {
1077 res = vinv_f16(vinvsqrt_f16(res));
1078 }
1079
1080 // Store result
1081 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
1082 },
1083 input, output);
1084 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001085#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001086 ARM_COMPUTE_UNUSED(window_input);
1087 ARM_COMPUTE_UNUSED(window);
1088 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001089#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +01001090}
1091
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001092template <typename T>
1093void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00001094{
1095 Iterator input(_input, window_input);
1096 Iterator output(_output, window);
1097
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001098 /** NEON vector types */
1099 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1100 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
1101 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
1102 using q16_t = typename wrapper::traits::promote_t<T>;
1103 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1104 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
1105
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001106 constexpr int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001107 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1108 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1109 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1110 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001111 int pool_stride_x = 0;
1112 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001113 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001114 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1115 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001116
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001117 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1118 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001119
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001120 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
1121 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1122 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
1123
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001124 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
1125 const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
1126 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001127
1128 execute_window_loop(window, [&](const Coordinates & id)
1129 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001130 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
1131 const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
1132 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
1133 q8x8_t fres = {};
1134 q8x16_t fqres = {};
Georgios Pinitas55186712018-01-08 17:37:12 +00001135
1136 if(pooling_type == PoolingType::AVG)
1137 {
1138 // Convert data to u16
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001139 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
1140 const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
1141 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +00001142
1143 // Calculate row sums
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001144 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001145 {
1146 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001147 wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
1148 wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001149 }
1150 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001151 const q16x8x2_t vrsum_shifted_1 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001152 {
1153 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001154 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
1155 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001156 }
1157 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001158 const q16x8x2_t vrsum_shifted_2 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001159 {
1160 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001161 wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
1162 wrapper::vext_2(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001163 }
1164 };
1165 // Calculate final sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001166 q16x8x2_t final_sum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001167 {
1168 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001169 wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1170 wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001171 }
1172 };
1173 if(pool_stride_x == 2)
1174 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001175 q16x8_t res =
Georgios Pinitas55186712018-01-08 17:37:12 +00001176 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001177 wrapper::vgetlane(final_sum.val[0], 0),
1178 wrapper::vgetlane(final_sum.val[0], 2),
1179 wrapper::vgetlane(final_sum.val[0], 4),
1180 wrapper::vgetlane(final_sum.val[0], 6),
1181 wrapper::vgetlane(final_sum.val[1], 0),
1182 wrapper::vgetlane(final_sum.val[1], 2),
1183 wrapper::vgetlane(final_sum.val[1], 4),
1184 wrapper::vgetlane(final_sum.val[1], 6),
Georgios Pinitas55186712018-01-08 17:37:12 +00001185 };
1186
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001187 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
1188 pool_size, upper_bound_w, upper_bound_h,
1189 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1190 fres = wrapper::vmovn(res);
Georgios Pinitas55186712018-01-08 17:37:12 +00001191 }
1192 else
1193 {
1194 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001195 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1196 pool_size, upper_bound_w, upper_bound_h,
1197 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001198 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001199 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1200 pool_size, upper_bound_w, upper_bound_h,
1201 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1202 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +00001203 }
1204 }
1205 else
1206 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001207 const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1208 const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1209 const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1210 const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
Georgios Pinitas55186712018-01-08 17:37:12 +00001211
1212 if(pool_stride_x == 2)
1213 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001214 const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1215 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1216 fres = wrapper::vtbl(table, lookup_val);
Georgios Pinitas55186712018-01-08 17:37:12 +00001217 }
1218 else
1219 {
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001220 fqres = final_max;
Georgios Pinitas55186712018-01-08 17:37:12 +00001221 }
1222 }
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001223
1224 // Store result
1225 if(pool_stride_x == 1)
1226 {
1227 if(input_qinfo != output_qinfo)
1228 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001229 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001230 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001231 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001232 }
1233 else
1234 {
1235 if(input_qinfo != output_qinfo)
1236 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001237 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001238 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001239 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001240 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001241 },
1242 input, output);
1243}
1244
Pablo Tello77e6c552018-12-04 15:33:49 +00001245void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001246{
Pablo Tello77e6c552018-12-04 15:33:49 +00001247 ARM_COMPUTE_UNUSED(pooling_type);
1248 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001249#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1250 Iterator input(_input, window_input);
1251 Iterator output(_output, window);
1252
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001253 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1254 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1255 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1256 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1257 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1258 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001259 int pool_stride_x = 0;
1260 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001261 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001262 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1263 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1264
1265 execute_window_loop(window, [&](const Coordinates & id)
1266 {
1267 float16_t res = 0.0f;
1268 float16x8_t vres = vdupq_n_f16(0.0f);
1269
1270 if(pooling_type != PoolingType::MAX)
1271 {
1272 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001273 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001274
1275 // Perform pooling
1276
1277 for(int y = 0; y < pool_size_y; ++y)
1278 {
1279 int x = 0;
1280 for(; x <= (pool_size_x - 8); x += 8)
1281 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001282 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1283 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001284
1285 // Get power of 2 in case of l2 pooling and accumulate
1286 if(pooling_type == PoolingType::L2)
1287 {
1288 vres = vaddq_f16(vres, vmulq_f16(data, data));
1289 }
1290 else
1291 {
1292 vres = vaddq_f16(vres, data);
1293 }
1294 }
1295
1296 // Leftover for loop
1297 for(; x < pool_size_x; ++x)
1298 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001299 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1300 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001301
1302 // Get power of 2 in case of l2 pooling
1303 if(pooling_type == PoolingType::L2)
1304 {
1305 data *= data;
1306 }
1307
1308 res += data;
1309 }
1310 }
1311
1312 // Reduction
1313 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1314 res += vget_lane_f16(tmp, 0);
1315 res += vget_lane_f16(tmp, 1);
1316 res += vget_lane_f16(tmp, 2);
1317 res += vget_lane_f16(tmp, 3);
1318
1319 // Divide by scale
1320 res *= scale;
1321 }
1322 else
1323 {
1324 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1325 res = std::numeric_limits<float>::lowest();
1326
1327 for(int y = 0; y < pool_size_y; ++y)
1328 {
1329 int x = 0;
1330 for(; x <= (pool_size_x - 8); x += 8)
1331 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001332 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1333 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001334 vres = vmaxq_f16(vres, data);
1335 }
1336
1337 // Leftover for loop
1338 for(; x < pool_size_x; ++x)
1339 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001340 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1341 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1342 res = std::max(res, data);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001343 }
1344 }
1345
1346 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1347 res = std::max(res, vget_lane_f16(tmp, 0));
1348 res = std::max(res, vget_lane_f16(tmp, 1));
1349 res = std::max(res, vget_lane_f16(tmp, 2));
1350 res = std::max(res, vget_lane_f16(tmp, 3));
1351 }
1352
1353 // Calculate square-root in case of l2 pooling
1354 if(pooling_type == PoolingType::L2)
1355 {
1356 res = std::sqrt(res);
1357 }
1358
1359 // Store result
1360 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1361 },
1362 input, output);
1363
1364#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1365 ARM_COMPUTE_UNUSED(window_input);
1366 ARM_COMPUTE_UNUSED(window);
1367 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1368#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1369}
1370
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** 2x2 max pooling for FP16 tensors that also records which input element won.
 *
 * Each iteration loads eight FP16 values from each of the four positions of the 2x2
 * region (x0 = top-left, x1 = top-right, x2 = bottom-left, x3 = bottom-right; the
 * horizontal neighbour is one y-stride away, the vertical neighbour one z-stride away),
 * stores the lane-wise maximum to the output and stores eight 32-bit indices to the
 * indices tensor.
 *
 * The indices are element offsets derived from offset_no_padding<float16_t>() divided
 * by sizeof(float16_t), with the tensor's right padding subtracted for the neighbouring
 * positions. On ties the comparison (>=) favours x0 over x1, x2 over x3 and the top row
 * over the bottom row.
 *
 * The winner selection is carried out on 32-bit lanes. Narrowing the offsets to 16 bits
 * for the selection would wrap every index above 65535.
 *
 * @param[in] window_input Window used to iterate over the input tensor.
 * @param[in] window       Window used to iterate over the output and indices tensors.
 */
void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window)
{
    Iterator input(_input, window_input);
    Iterator output(_output, window);
    Iterator indices(_indices, window);

    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();

    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();

    const int pad_right   = _input->info()->padding().right;
    const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
    const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const int idx_width    = id.y() * pool_stride_x;
        const int idx_height   = id.z() * pool_stride_y;
        const int pool_limit_y = pool_pad_top - idx_height;
        const int pool_limit_x = pool_pad_left - idx_width;

        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);

        // Byte offsets of the four positions of the 2x2 region relative to the iterator position
        const int in_x0_offset = (pool_start_x - pool_pad_left) * in_stride_y + (pool_start_y - pool_pad_top) * in_stride_z;
        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * in_stride_y + (pool_start_y - pool_pad_top) * in_stride_z;
        const int in_x2_offset = (pool_start_x - pool_pad_left) * in_stride_y + (pool_start_y + 1 - pool_pad_top) * in_stride_z;
        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * in_stride_y + (pool_start_y + 1 - pool_pad_top) * in_stride_z;

        const auto v_x0 = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset));
        const auto v_x1 = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset));
        const auto v_x2 = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset));
        const auto v_x3 = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset));

        const float16x8_t vmax_top    = vmaxq_f16(v_x0, v_x1);
        const float16x8_t vmax_bottom = vmaxq_f16(v_x2, v_x3);

        // Store result
        const float16x8_t vres = vmaxq_f16(vmax_bottom, vmax_top);
        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);

        // Element offsets of the first lane of each of the four positions
        const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
        const uint32_t offset_x0   = offset_base / sizeof(float16_t);
        const uint32_t offset_x1   = offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
        const uint32_t offset_x2   = offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
        const uint32_t offset_x3   = offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;

        // Per-lane increments for the low (lanes 0-3) and high (lanes 4-7) halves
        const uint32x4_t lanes_lo = { 0, 1, 2, 3 };
        const uint32x4_t lanes_hi = { 4, 5, 6, 7 };

        // Sign-extend a 16-bit all-ones/all-zeros comparison mask to 32-bit lanes
        const auto widen_mask_lo = [](uint16x8_t mask)
        {
            return vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(mask))));
        };
        const auto widen_mask_hi = [](uint16x8_t mask)
        {
            return vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(mask))));
        };

        const uint16x8_t pick_x0_over_x1      = vcgeq_f16(v_x0, v_x1);
        const uint16x8_t pick_x2_over_x3      = vcgeq_f16(v_x2, v_x3);
        const uint16x8_t pick_top_over_bottom = vcgeq_f16(vmax_top, vmax_bottom);

        // Winner within the top row and within the bottom row, kept at full 32-bit width
        const uint32x4_t top_lo    = vbslq_u32(widen_mask_lo(pick_x0_over_x1), vaddq_u32(vdupq_n_u32(offset_x0), lanes_lo), vaddq_u32(vdupq_n_u32(offset_x1), lanes_lo));
        const uint32x4_t top_hi    = vbslq_u32(widen_mask_hi(pick_x0_over_x1), vaddq_u32(vdupq_n_u32(offset_x0), lanes_hi), vaddq_u32(vdupq_n_u32(offset_x1), lanes_hi));
        const uint32x4_t bottom_lo = vbslq_u32(widen_mask_lo(pick_x2_over_x3), vaddq_u32(vdupq_n_u32(offset_x2), lanes_lo), vaddq_u32(vdupq_n_u32(offset_x3), lanes_lo));
        const uint32x4_t bottom_hi = vbslq_u32(widen_mask_hi(pick_x2_over_x3), vaddq_u32(vdupq_n_u32(offset_x2), lanes_hi), vaddq_u32(vdupq_n_u32(offset_x3), lanes_hi));

        // Overall winner between the two rows
        const uint32x4_t indices_lo = vbslq_u32(widen_mask_lo(pick_top_over_bottom), top_lo, bottom_lo);
        const uint32x4_t indices_hi = vbslq_u32(widen_mask_hi(pick_top_over_bottom), top_hi, bottom_hi);

        // Store indices: lanes 0-3 first, lanes 4-7 sixteen bytes further on
        vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()), indices_lo);
        vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16), indices_hi);
    },
    input, output, indices);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1450
Pablo Tello77e6c552018-12-04 15:33:49 +00001451void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001452{
Pablo Tello77e6c552018-12-04 15:33:49 +00001453 ARM_COMPUTE_UNUSED(pooling_type);
1454 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001455#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhange0681992020-07-14 15:29:28 +01001456 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1457 {
1458 pooling2_f16_nhwc_maxpool_indices(window_input, window);
1459 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001460 Iterator input(_input, window_input);
1461 Iterator output(_output, window);
1462
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001463 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1464 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1465 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1466 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1467 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1468 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001469 int pool_stride_x = 0;
1470 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001471 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001472 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1473 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1474
1475 float16x8_t vres;
1476
1477 execute_window_loop(window, [&](const Coordinates & id)
1478 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001479 const int idx_width = id.y() * pool_stride_x;
1480 const int idx_height = id.z() * pool_stride_y;
1481 const int pool_limit_y = pool_pad_top - idx_height;
1482 const int pool_limit_x = pool_pad_left - idx_width;
1483
1484 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1485 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1486 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1487 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1488
Michalis Spyrou57dac842018-03-01 16:03:50 +00001489 if(pooling_type != PoolingType::MAX)
1490 {
1491 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001492 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1493 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001494 const float16x8_t scale_v = vdupq_n_f16(scale);
1495
1496 // Perform pooling
1497 vres = vdupq_n_f16(0.0f);
Michalis Spyrouced25572018-10-01 16:26:20 +01001498 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001499 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001500 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001501 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001502 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1503 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001504
1505 // Get power of 2 in case of l2 pooling and accumulate
1506 if(pooling_type == PoolingType::L2)
1507 {
1508 vres = vaddq_f16(vres, vmulq_f16(data, data));
1509 }
1510 else
1511 {
1512 vres = vaddq_f16(vres, data);
1513 }
1514 }
1515 }
1516 // Divide by scale
1517 vres = vmulq_f16(vres, scale_v);
1518 }
1519 else
1520 {
1521 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001522
1523 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001524 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001525 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001526 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001527 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1528 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001529 vres = vmaxq_f16(vres, data);
1530 }
1531 }
1532 }
1533
1534 // Calculate square-root in case of l2 pooling
1535 if(pooling_type == PoolingType::L2)
1536 {
1537 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1538 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1539 }
1540
1541 // Store result
1542 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
1543 },
1544 input, output);
1545
1546#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1547 ARM_COMPUTE_UNUSED(window_input);
1548 ARM_COMPUTE_UNUSED(window);
1549 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1550#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1551}
1552
Pablo Tello77e6c552018-12-04 15:33:49 +00001553void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001554{
1555 Iterator input(_input, window_input);
1556 Iterator output(_output, window);
1557
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001558 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1559 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1560 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1561 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1562 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1563 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001564 int pool_stride_x = 0;
1565 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001566 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001567 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1568 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001569
1570 execute_window_loop(window, [&](const Coordinates & id)
1571 {
1572 float res = 0.0f;
1573
1574 if(pooling_type != PoolingType::MAX)
1575 {
1576 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001577 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001578
1579 // Perform pooling
1580 float32x4_t vres = vdupq_n_f32(0.0f);
1581
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001582 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001583 {
1584 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001585 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001586 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001587 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1588 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001589
1590 // Get power of 2 in case of l2 pooling and accumulate
1591 if(pooling_type == PoolingType::L2)
1592 {
1593 vres = vmlaq_f32(vres, data, data);
1594 }
1595 else
1596 {
1597 vres = vaddq_f32(vres, data);
1598 }
1599 }
1600
1601 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001602 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001603 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001604 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1605 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001606
1607 // Get power of 2 in case of l2 pooling
1608 if(pooling_type == PoolingType::L2)
1609 {
1610 data *= data;
1611 }
1612
1613 res += data;
1614 }
1615 }
1616
1617#if defined(__aarch64__)
1618 // Reduction operation available on 64 bit architectures only
1619 res += vaddvq_f32(vres);
1620#else // __aarch64__
1621 // Reduction
1622 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1623 tmp = vpadd_f32(tmp, tmp);
1624
1625 res += vget_lane_f32(tmp, 0);
1626#endif // __aarch64__
1627 // Divide by scale
1628 res *= scale;
1629 }
1630 else
1631 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001632 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1633 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001634
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001635 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001636 {
1637 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001638 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001639 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001640 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1641 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001642 vres = vmaxq_f32(vres, data);
1643 }
1644
1645 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001646 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001647 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001648 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1649 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001650 res = std::max(res, data);
1651 }
1652 }
Gian Marco Iodice16824302017-09-28 15:41:37 +01001653#if defined(__aarch64__)
1654 // Reduction operation available on 64 bit architectures only
1655 res = std::max(vmaxvq_f32(vres), res);
1656#else // __aarch64__
1657 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1658 tmp = vpmax_f32(tmp, tmp);
1659
1660 res = std::max(res, vget_lane_f32(tmp, 0));
1661#endif // __aarch64__
1662 }
1663
1664 // Calculate square-root in case of l2 pooling
1665 if(pooling_type == PoolingType::L2)
1666 {
1667 res = std::sqrt(res);
1668 }
1669
1670 // Store result
1671 *(reinterpret_cast<float *>(output.ptr())) = res;
1672 },
1673 input, output);
1674}
1675
morgolockcc1f6c92020-03-24 09:26:48 +00001676void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
1677 bool exclude_padding)
1678{
1679 if(pooling_type == PoolingType::MAX && _indices)
1680 {
Sheri Zhange0681992020-07-14 15:29:28 +01001681 pooling2_nchw_maxpool_indices<float>(window_input, window);
morgolockcc1f6c92020-03-24 09:26:48 +00001682 }
1683 else
1684 {
1685 Iterator input(_input, window_input);
1686 Iterator output(_output, window);
1687 constexpr int pool_size = 2;
1688 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1689 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1690 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1691 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1692 int pool_stride_x = 0;
1693 int pool_stride_y = 0;
1694 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1695 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1696 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1697
1698 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1699 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1700
1701 execute_window_loop(window, [&](const Coordinates & id)
1702 {
1703 const auto in_top_ptr = reinterpret_cast<const float *>(input_top_ptr + input.offset());
1704 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
1705 float32x2_t top_data = vld1_f32(in_top_ptr);
1706 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1707 float32x2_t res = {};
1708 float final_res = 0;
1709 // Get power of 2 in case of l2 pooling
1710 if(pooling_type == PoolingType::L2)
1711 {
1712 top_data = vmul_f32(top_data, top_data);
1713 bottom_data = vmul_f32(bottom_data, bottom_data);
1714 }
1715
1716 if(pooling_type != PoolingType::MAX)
1717 {
1718 // Calculate scale
1719 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1720 const float32x2_t scale_v = vdup_n_f32(scale);
1721
1722 // Perform pooling
1723 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1724 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1725 }
1726 else
1727 {
1728 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1729 res = vpmax_f32(max_data, max_data);
1730 }
1731 final_res = vget_lane_f32(res, 0);
1732
1733 // Calculate square-root in case of l2 pooling
1734 if(pooling_type == PoolingType::L2)
1735 {
1736 final_res = sqrt(final_res);
1737 }
1738
1739 // Store result
1740 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1741 },
1742 input, output);
1743 }
Pablo Tello77e6c552018-12-04 15:33:49 +00001744}
1745
1746void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1747{
1748 Iterator input(_input, window_input);
1749 Iterator output(_output, window);
1750
1751 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001752 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1753 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1754 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1755 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001756 int pool_stride_x = 0;
1757 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001758 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001759 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1760 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1761
1762 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1763 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1764 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1765
1766 execute_window_loop(window, [&](const Coordinates & id)
1767 {
1768 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1769 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1770 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1771 float32x2_t res = {};
1772 float final_res = 0;
1773
1774 // Get power of 2 in case of l2 pooling
1775 if(pooling_type == PoolingType::L2)
1776 {
1777 top_data = vmulq_f32(top_data, top_data);
1778 middle_data = vmulq_f32(middle_data, middle_data);
1779 bottom_data = vmulq_f32(bottom_data, bottom_data);
1780 }
1781
1782 if(pooling_type != PoolingType::MAX)
1783 {
1784 // Calculate scale
1785 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1786 const float32x2_t scale_v = vdup_n_f32(scale);
1787
1788 // Perform pooling
1789 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1790 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1791 res = vmul_f32(vpadd_f32(res, res), scale_v);
1792 }
1793 else
1794 {
1795 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1796 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1797 res = vpmax_f32(res, res);
1798 }
1799 final_res = vget_lane_f32(res, 0);
1800
1801 // Calculate square-root in case of l2 pooling
1802 if(pooling_type == PoolingType::L2)
1803 {
1804 final_res = sqrt(final_res);
1805 }
1806
1807 // Store result
1808 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1809 },
1810 input, output);
1811}
1812
1813void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1814{
1815 Iterator input(_input, window_input);
1816 Iterator output(_output, window);
1817
1818 constexpr const int pool_size = 7;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001819 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1820 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1821 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1822 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001823 int pool_stride_x = 0;
1824 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001825 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001826 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1827 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1828
1829 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1830 for(int i = 0; i < pool_size; ++i)
1831 {
1832 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1833 }
1834
1835 execute_window_loop(window, [&](const Coordinates & id)
1836 {
1837 float32x2_t res = {};
1838 float final_res = 0.f;
1839 if(pooling_type != PoolingType::MAX)
1840 {
1841 // Calculate scale
1842 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1843 const float32x2_t scale_v = vdup_n_f32(scale);
1844
1845 // Perform pooling
1846 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1847 // Get power of 2 in case of l2 pooling
1848 if(pooling_type == PoolingType::L2)
1849 {
1850 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1851 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1852 }
1853 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1854 for(int i = 1; i < pool_size; ++i)
1855 {
1856 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1857 // Get power of 2 in case of l2 pooling
1858 if(pooling_type == PoolingType::L2)
1859 {
1860 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1861 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1862 }
1863 sum_data = vaddq_f32(sum_data, data.val[0]);
1864 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1865 }
1866 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1867 res = vmul_f32(vpadd_f32(res, res), scale_v);
1868 }
1869 else
1870 {
1871 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1872 for(int i = 1; i < pool_size; ++i)
1873 {
1874 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1875 max_data = vmax2q_f32(max_data, data);
1876 }
1877 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1878 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1879 res = vpmax_f32(res, res);
1880 }
1881 final_res = vget_lane_f32(res, 0);
1882
1883 // Calculate square-root in case of l2 pooling
1884 if(pooling_type == PoolingType::L2)
1885 {
1886 final_res = sqrt(final_res);
1887 }
1888
1889 // Store result
1890 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1891 },
1892 input, output);
1893}
1894
1895void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001896{
morgolocke383c352020-04-03 16:57:46 +01001897 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1898 {
1899 pooling2_f32_nhwc_maxpool_indices(window_input, window);
1900 }
1901 else
1902 {
1903 Iterator input(_input, window_input);
1904 Iterator output(_output, window);
1905
1906 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1907 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1908 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1909 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1910 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1911 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1912 int pool_stride_x = 0;
1913 int pool_stride_y = 0;
1914 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1915 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1916 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1917
1918 float32x4_t vres;
1919
1920 execute_window_loop(window, [&](const Coordinates & id)
1921 {
1922 const int idx_width = id.y() * pool_stride_x;
1923 const int idx_height = id.z() * pool_stride_y;
1924 const int pool_limit_y = pool_pad_top - idx_height;
1925 const int pool_limit_x = pool_pad_left - idx_width;
1926
1927 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1928 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1929 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1930 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1931
1932 if(pooling_type != PoolingType::MAX)
1933 {
1934 // Calculate scale
1935 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1936 pool_stride_y);
1937 const float32x4_t scale_v = vdupq_n_f32(scale);
1938
1939 // Perform pooling
1940 vres = vdupq_n_f32(0.0f);
1941
1942 for(int y = pool_start_y; y < pool_end_y; ++y)
1943 {
1944 for(int x = pool_start_x; x < pool_end_x; ++x)
1945 {
1946 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1947 (_input->info()->strides_in_bytes().z())));
1948
1949 // Get power of 2 in case of l2 pooling and accumulate
1950 if(pooling_type == PoolingType::L2)
1951 {
1952 vres = vmlaq_f32(vres, data, data);
1953 }
1954 else
1955 {
1956 vres = vaddq_f32(vres, data);
1957 }
1958 }
1959 }
1960 // Divide by scale
1961 vres = vmulq_f32(vres, scale_v);
1962 }
1963 else
1964 {
1965 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1966 for(int y = pool_start_y; y < pool_end_y; ++y)
1967 {
1968 for(int x = pool_start_x; x < pool_end_x; ++x)
1969 {
1970 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1971 (_input->info()->strides_in_bytes().z())));
1972 vres = vmaxq_f32(vres, data);
1973 }
1974 }
1975 }
1976
1977 // Calculate square-root in case of l2 pooling
1978 if(pooling_type == PoolingType::L2)
1979 {
1980 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1981 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
1982 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
1983 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
1984 };
1985 vres = l2_res;
1986 }
1987
1988 // Store result
1989 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
1990 },
1991 input, output);
1992 }
1993}
1994
1995void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
1996{
Michalis Spyrou57dac842018-03-01 16:03:50 +00001997 Iterator input(_input, window_input);
1998 Iterator output(_output, window);
morgolocke383c352020-04-03 16:57:46 +01001999 Iterator indices(_indices, window);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002000
morgolocke383c352020-04-03 16:57:46 +01002001 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2002 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2003
2004 int pool_stride_x = 0;
2005 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002006 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002007
2008 float32x4_t vres;
2009
morgolocke383c352020-04-03 16:57:46 +01002010 const int pad_right = _input->info()->padding().right;
morgolocke383c352020-04-03 16:57:46 +01002011 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
2012 const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
morgolocke383c352020-04-03 16:57:46 +01002013
Michalis Spyrou57dac842018-03-01 16:03:50 +00002014 execute_window_loop(window, [&](const Coordinates & id)
2015 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002016 const int idx_width = id.y() * pool_stride_x;
2017 const int idx_height = id.z() * pool_stride_y;
2018 const int pool_limit_y = pool_pad_top - idx_height;
2019 const int pool_limit_x = pool_pad_left - idx_width;
2020
2021 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
Michalis Spyrouced25572018-10-01 16:26:20 +01002022 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
morgolocke383c352020-04-03 16:57:46 +01002023 const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2024 (_input->info()->strides_in_bytes().z());
2025 const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2026 (_input->info()->strides_in_bytes().z());
Michalis Spyrouced25572018-10-01 16:26:20 +01002027
morgolocke383c352020-04-03 16:57:46 +01002028 const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2029 (_input->info()->strides_in_bytes().z());
Michalis Spyrou57dac842018-03-01 16:03:50 +00002030
morgolocke383c352020-04-03 16:57:46 +01002031 const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2032 (_input->info()->strides_in_bytes().z());
Michalis Spyrou57dac842018-03-01 16:03:50 +00002033
morgolocke383c352020-04-03 16:57:46 +01002034 const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
2035 const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
2036 const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
2037 const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
2038 const auto v_x0 = vld1q_f32(in_x0_ptr);
2039 const auto v_x1 = vld1q_f32(in_x1_ptr);
2040 const auto v_x2 = vld1q_f32(in_x2_ptr);
2041 const auto v_x3 = vld1q_f32(in_x3_ptr);
2042 vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002043 // Store result
2044 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
morgolocke383c352020-04-03 16:57:46 +01002045
morgolock37722d92020-04-09 14:17:48 +01002046 const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2047 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float);
2048 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2049 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2050 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
morgolocke383c352020-04-03 16:57:46 +01002051 const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
2052 const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
2053 const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
2054 const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
Sheri Zhange0681992020-07-14 15:29:28 +01002055 const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
2056 const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
2057 const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
2058 // Store indices
morgolocke383c352020-04-03 16:57:46 +01002059 vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()), tmp_indices2);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002060 },
morgolocke383c352020-04-03 16:57:46 +01002061 input, output, indices);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002062}
2063
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002064template <typename T>
2065void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00002066{
2067 Iterator input(_input, window_input);
2068 Iterator output(_output, window);
2069
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002070 /** NEON vector types */
2071 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2072 using q16_t = typename wrapper::traits::promote_t<T>;
2073 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2074 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2075 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2076
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002077 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
2078 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
2079 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2080 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2081 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2082 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00002083 int pool_stride_x = 0;
2084 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002085 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00002086 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
2087 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00002088
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01002089 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
2090 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
2091
Georgios Pinitas55186712018-01-08 17:37:12 +00002092 execute_window_loop(window, [&](const Coordinates & id)
2093 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002094 T res = std::numeric_limits<T>::min();
Georgios Pinitas55186712018-01-08 17:37:12 +00002095
2096 if(pooling_type != PoolingType::MAX)
2097 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002098 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2099 q32_t sres = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00002100
2101 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00002102 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00002103
2104 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002105 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002106 {
2107 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002108 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002109 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002110 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2111 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00002112
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002113 const q16x8_t data_q16 = wrapper::vmovl(data);
2114 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
Georgios Pinitas55186712018-01-08 17:37:12 +00002115 }
2116
2117 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002118 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002119 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002120 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2121 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00002122 sres += data;
2123 }
2124 }
2125
2126 // Reduction
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002127 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
2128 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
Georgios Pinitas55186712018-01-08 17:37:12 +00002129
2130 // Divide by scale
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002131 res = static_cast<T>(support::cpp11::round(sres * scale));
Georgios Pinitas55186712018-01-08 17:37:12 +00002132 }
2133 else
2134 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002135 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
Georgios Pinitas55186712018-01-08 17:37:12 +00002136
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002137 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002138 {
2139 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002140 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002141 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002142 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2143 (_input->info()->strides_in_bytes().y())));
2144 vres = wrapper::vmax(vres, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002145 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002146 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002147 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002148 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002149 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2150 (_input->info()->strides_in_bytes().y())));
2151 res = std::max(res, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002152 }
2153 }
2154
2155 // Reduce max
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002156 vres = wrapper::vpmax(vres, vres);
2157 vres = wrapper::vpmax(vres, vres);
2158 vres = wrapper::vpmax(vres, vres);
Georgios Pinitas55186712018-01-08 17:37:12 +00002159
2160 // Get max value
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002161 res = std::max(res, wrapper::vgetlane(vres, 0));
Georgios Pinitas55186712018-01-08 17:37:12 +00002162 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002163 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002164 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
2165 *(reinterpret_cast<T *>(output.ptr())) = res;
Georgios Pinitas55186712018-01-08 17:37:12 +00002166 },
2167 input, output);
2168}
2169
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002170template <typename T>
2171void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002172{
2173 Iterator input(_input, window_input);
2174 Iterator output(_output, window);
2175
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002176 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2177 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
2178 using q16_t = typename wrapper::traits::promote_t<T>;
2179 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2180 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2181 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2182
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002183 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
2184 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
2185 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2186 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2187 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2188 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002189
2190 int pool_stride_x = 0;
2191 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002192 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002193 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
2194 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
2195
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01002196 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
2197 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
2198 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitas283fc602018-11-09 10:46:43 +00002199
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002200 const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002201 // "new_offset" doesn't have to consider the "half_scale_v" in its computation
2202 // With a requantization performed in a single step there won't be uncertainties introduced
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002203 const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002204
2205 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
2206 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
2207 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
2208
Michalis Spyrou57dac842018-03-01 16:03:50 +00002209 execute_window_loop(window, [&](const Coordinates & id)
2210 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002211 const int idx_width = id.y() * pool_stride_x;
2212 const int idx_height = id.z() * pool_stride_y;
2213 const int pool_limit_y = pool_pad_top - idx_height;
2214 const int pool_limit_x = pool_pad_left - idx_width;
2215
2216 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2217 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
2218 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2219 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
2220
Michalis Spyrou57dac842018-03-01 16:03:50 +00002221 if(pooling_type != PoolingType::MAX)
2222 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002223 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2224 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2225 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2226 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002227
2228 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00002229 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2230 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002231
2232 // Perform pooling
Michalis Spyrouced25572018-10-01 16:26:20 +01002233 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002234 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002235 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002236 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002237 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2238 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002239
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002240 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
2241 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
2242 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
2243 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
2244 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
2245 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002246 }
2247 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002248
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002249 if(input_qinfo != output_qinfo)
2250 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002251 const float32x4x4_t vres =
2252 {
2253 {
2254 vcvtq_f32_q32(vres1),
2255 vcvtq_f32_q32(vres2),
2256 vcvtq_f32_q32(vres3),
2257 vcvtq_f32_q32(vres4),
2258 }
2259 };
2260 const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
2261 // Store result
2262 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), wrapper::vgetlow(requantized_output));
2263 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, wrapper::vgethigh(requantized_output));
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002264 }
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002265 else
2266 {
2267 const float32x4_t scale_v = vdupq_n_f32(scale);
2268 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2269 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
2270 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
2271 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
2272 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002273
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002274 const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
2275 const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
2276 // Store result
2277 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res1);
2278 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, res2);
2279 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002280 }
2281 else
2282 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002283 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002284
Michalis Spyrouced25572018-10-01 16:26:20 +01002285 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002286 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002287 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002288 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002289 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2290 (_input->info()->strides_in_bytes().z())));
2291 vres = wrapper::vmax(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002292 }
2293 }
2294
2295 // Store result
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002296 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002297 }
2298 },
2299 input, output);
2300}
2301
morgolockcc1f6c92020-03-24 09:26:48 +00002302Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002303{
2304 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2305
2306 unsigned int pooled_w = 0;
2307 unsigned int pooled_h = 0;
2308 unsigned int num_elems_processed_per_iteration = 0;
2309 BorderSize border_size(0);
2310
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002311 const bool is_global_pooling = pool_info.is_global_pooling;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002312 unsigned int pool_size_x = 0;
2313 unsigned int pool_size_y = 0;
2314
2315 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +00002316 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2317 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2318 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002319
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002320 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2321 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002322
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002323 // Validate pool info before calling scaled_dimensions
2324 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002325
2326 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00002327 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2328 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002329 pool_size_x,
2330 pool_size_y,
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002331 pool_info.pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002332
morgolockcc1f6c92020-03-24 09:26:48 +00002333 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
2334 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
2335 (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002336 pool_size_x, pool_size_y)
2337 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002338
2339 return Status{};
2340}
2341
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002342void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002343{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002344 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002345 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2346 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2347 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2348
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002349 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
2350 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
2351 const unsigned int pool_size = _pool_info.pool_size.width;
2352 const bool exclude_padding = _pool_info.exclude_padding;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002353
Michalis Spyrou57dac842018-03-01 16:03:50 +00002354 Window window_input(window);
Georgios Pinitas14d9d982019-12-13 12:33:09 +00002355 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002356 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002357 // Set step for input in x and y direction for the input
2358 unsigned int window_x_inc = 0;
2359 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01002360 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002361 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002362 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002363 {
2364 window_x_inc = pool_stride_x;
2365 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2366 {
2367 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2368 }
2369 break;
2370 }
Pablo Tello77e6c552018-12-04 15:33:49 +00002371
Georgios Pinitas13d96e02018-08-23 11:20:23 +01002372 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002373 case DataType::F32:
2374 {
2375 window_x_inc = pool_stride_x;
2376 break;
2377 }
2378 default:
2379 {
2380 ARM_COMPUTE_ERROR("Not supported");
2381 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002382 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002383 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2384 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002385 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002386 else
2387 {
Georgios Pinitascac13b12018-04-27 19:07:19 +01002388 window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002389 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2390 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2391 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002392
2393 // Run function
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002394 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002395}
morgolockcc1f6c92020-03-24 09:26:48 +00002396} // namespace arm_compute