blob: 6d61f51f31e3111f638ede9d6c063d0052de23fa [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002 * Copyright (c) 2017-2020 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010038#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010039
Georgios Pinitas55186712018-01-08 17:37:12 +000040#include "support/ToolchainSupport.h"
41
Manuel Bottinib4bb8272019-12-18 18:01:27 +000042#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043#include <algorithm>
44#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010045#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010047#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010048#include <string>
49#include <tuple>
50
Manuel Bottinib4bb8272019-12-18 18:01:27 +000051namespace arm_compute
52{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010053using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010054
55namespace
56{
Pablo Tello77e6c552018-12-04 15:33:49 +000057inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010058 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
59{
Michalis Spyrou57dac842018-03-01 16:03:50 +000060 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
61 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
62
63 int start_x = id[idx_width] * stride_x - pad_x;
64 int start_y = id[idx_height] * stride_y - pad_y;
65
66 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
67 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000068 if(exclude_padding)
69 {
70 start_x = std::max(0, start_x);
71 start_y = std::max(0, start_y);
72 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073 return 1.f / ((end_y - start_y) * (end_x - start_x));
74}
75
Manuel Bottinib4bb8272019-12-18 18:01:27 +000076template <typename T, typename TVec>
77inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
Georgios Pinitas55186712018-01-08 17:37:12 +000078 const int pool_size, const int upper_bound_w, const int upper_bound_h,
79 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
80{
81 int start_x = (id.x() + id_offset) * stride_x - pad_x;
82 int start_y = id.y() * stride_y - pad_y;
83 const int end_y = std::min(start_y + pool_size, upper_bound_h);
84 if(exclude_padding)
85 {
86 start_y = std::max(0, start_y);
87 }
88
Manuel Bottinib4bb8272019-12-18 18:01:27 +000089 std::array<T, 8> elems =
Georgios Pinitas55186712018-01-08 17:37:12 +000090 {
91 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +000092 wrapper::vgetlane(v, 0),
93 wrapper::vgetlane(v, 1),
94 wrapper::vgetlane(v, 2),
95 wrapper::vgetlane(v, 3),
96 wrapper::vgetlane(v, 4),
97 wrapper::vgetlane(v, 5),
98 wrapper::vgetlane(v, 6),
99 wrapper::vgetlane(v, 7),
Georgios Pinitas55186712018-01-08 17:37:12 +0000100 }
101 };
102
103 for(auto &el : elems)
104 {
105 int c_start_x = start_x;
106 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
107 if(exclude_padding)
108 {
109 c_start_x = std::max(0, c_start_x);
110 }
111 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
112 el *= scale;
113 start_x += step * stride_x;
114 }
115
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000116 v = wrapper::vsetlane(elems[0], v, 0);
117 v = wrapper::vsetlane(elems[1], v, 1);
118 v = wrapper::vsetlane(elems[2], v, 2);
119 v = wrapper::vsetlane(elems[3], v, 3);
120 v = wrapper::vsetlane(elems[4], v, 4);
121 v = wrapper::vsetlane(elems[5], v, 5);
122 v = wrapper::vsetlane(elems[6], v, 6);
123 v = wrapper::vsetlane(elems[7], v, 7);
Georgios Pinitas55186712018-01-08 17:37:12 +0000124}
125
morgolockcc1f6c92020-03-24 09:26:48 +0000126Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
127 unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100128{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000129 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100130
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000131 int pool_stride_x = 0;
132 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000133 PoolingType pool_type = pool_info.pool_type;
134 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100135 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100136
Anthony Barbiereaefd002018-07-20 17:49:35 +0100137 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
morgolockcc1f6c92020-03-24 09:26:48 +0000138 if(indices)
139 {
140 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
141 ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
142 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000143 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000144 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Michele Di Giorgio2c877192020-02-18 19:06:27 +0000145 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
146 && (input->data_layout() == DataLayout::NHWC),
147 "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000148
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000149 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100150 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000151 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000152 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
153 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
154 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
morgolockcc1f6c92020-03-24 09:26:48 +0000155
156 if(indices)
157 {
158 ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
morgolocke383c352020-04-03 16:57:46 +0100159
morgolockcc1f6c92020-03-24 09:26:48 +0000160 ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
161 || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
162 }
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100163 }
164
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000165 return Status{};
166}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100167
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000168Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000169{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000170 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
171 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000172
173 return Status{};
174}
175
morgolockcc1f6c92020-03-24 09:26:48 +0000176std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
177 unsigned int &num_elems_processed_per_iteration,
178 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000179 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000180{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100181 // Output auto inizialitation if not yet initialized
182 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
morgolockcc1f6c92020-03-24 09:26:48 +0000183 if(indices)
184 {
185 // Indices auto inizialitation if not yet initialized
morgolocke383c352020-04-03 16:57:46 +0100186 auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
187 pool_info)))
188 .set_data_type(DataType::U32) /* we store the offset to the element */);
morgolockcc1f6c92020-03-24 09:26:48 +0000189 }
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000190 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000191 unsigned int num_elems_read_per_iteration = 0;
192 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000193 int pool_stride_x = 0;
194 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000195 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
196 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
197 const int input_width = input->dimension(idx_width);
198 const int input_height = input->dimension(idx_height);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000199 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000200 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000201 const int pool_pad_right = pad_stride_info.pad_right();
202 const int pool_pad_top = pad_stride_info.pad_top();
203 const int pool_pad_left = pad_stride_info.pad_left();
204 const int pool_pad_bottom = pad_stride_info.pad_bottom();
205 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000206
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000207 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +0000208 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
209 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000210 pool_size_x,
211 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000212 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100213
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000214 //If it's not squared and optimized will be executed the MxN
215 num_elems_read_per_iteration = 1;
216 num_elems_processed_per_iteration = 1;
217 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100218
Michalis Spyrou57dac842018-03-01 16:03:50 +0000219 const bool is_nhwc = data_layout == DataLayout::NHWC;
220
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000221 if(is_square)
222 {
223 switch(input->data_type())
224 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000225 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000226 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000227 if(is_nhwc)
228 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100229 num_elems_processed_per_iteration = 16;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000230 break;
231 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000232 switch(pool_size_x)
233 {
234 case 2:
235 num_elems_read_per_iteration = 16;
236 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
237 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
238 break;
239 case 3:
240 num_elems_read_per_iteration = 16;
241 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
242 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
243 break;
244 default:
245 break;
246 }
247 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000248#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
249 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000250 if(is_nhwc)
251 {
252 num_elems_processed_per_iteration = 8;
253 break;
254 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000255 switch(pool_size_x)
256 {
257 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000258 case 3:
259 num_elems_read_per_iteration = 4;
260 num_elems_processed_per_iteration = 1;
261 num_elems_horizontal_window = 1;
262 break;
263 default:
264 break;
265 }
266 break;
267#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
268 case DataType::F32:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000269 if(is_nhwc)
270 {
Georgios Pinitas64f1a902018-09-18 13:42:51 +0100271 num_elems_processed_per_iteration = 4;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000272 break;
273 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000274 switch(pool_size_x)
275 {
276 case 2:
277 num_elems_read_per_iteration = 2;
278 break;
279 case 3:
280 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
281 break;
282 case 7:
283 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
284 break;
285 default:
286 break;
287 }
288 num_elems_processed_per_iteration = 1;
289 num_elems_horizontal_window = 1;
290 break;
291 default:
292 ARM_COMPUTE_ERROR("Element size not supported");
293 break;
294 }
295 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000296 else
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000297 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000298 if(is_nhwc)
299 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100300 num_elems_processed_per_iteration = 16 / input->element_size();
Michalis Spyrou57dac842018-03-01 16:03:50 +0000301 }
302 }
303
304 bool window_changed = false;
305 Window win{};
306 if(data_layout == DataLayout::NCHW)
307 {
308 // Number of iterations in X dimension
309 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000310 // Upper limit for the number of right/bottom border elements that are accessed
311 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
312 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
morgolockcc1f6c92020-03-24 09:26:48 +0000313 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
314 border_size.right = std::max(upper_bound_w, pool_pad_right);
315 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000316 TensorShape output_shape{ input->tensor_shape() };
317 output_shape.set(0, pooled_w);
318 output_shape.set(1, pooled_h);
319 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000320 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
morgolockcc1f6c92020-03-24 09:26:48 +0000321 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000322 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
morgolockcc1f6c92020-03-24 09:26:48 +0000323 if(indices)
324 {
325 AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
326 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
327 }
328 else
329 {
330 window_changed = update_window_and_padding(win, input_access, output_access);
331 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000332 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
333 }
334 else
335 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000336 TensorShape output_shape{ input->tensor_shape() };
337 output_shape.set(1, pooled_w);
338 output_shape.set(2, pooled_h);
339 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000340 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
341 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000342 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
morgolockcc1f6c92020-03-24 09:26:48 +0000343 if(indices)
344 {
345 AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
346 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
347 }
348 else
349 {
350 window_changed = update_window_and_padding(win, input_access, output_access);
351 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000352 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000353 }
354
355 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
356 return std::make_pair(err, win);
357}
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000358
359template <typename T>
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000360inline T vcvtq_q32_f32(float32x4_t values);
361
362template <>
363inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
364{
365 return vcvtq_u32_f32(values);
366}
367
368template <>
369inline int32x4_t vcvtq_q32_f32(float32x4_t values)
370{
371 return vcvtq_s32_f32(values);
372}
373
374template <typename T>
375inline float32x4_t vcvtq_f32_q32(T values);
376
377template <>
378inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
379{
380 return vcvtq_f32_u32(values);
381}
382
383template <>
384inline float32x4_t vcvtq_f32_q32(int32x4_t values)
385{
386 return vcvtq_f32_s32(values);
387}
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000388
389template <typename Tout>
390inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
391
392template <>
393inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
394{
395 const float new_scale = quant_rescale / scale_pooling;
396 return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
397}
398
399template <>
400inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
401{
402 const float new_scale = quant_rescale / scale_pooling;
403 return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
404}
405
406template <typename Tin, typename Tout>
407inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
408
409template <>
410inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
411{
412 const float32x4x4_t acc =
413 {
414 {
415 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
416 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
417 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
418 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
419 }
420 };
421 return vquantize(acc, requant_qinfo);
422}
423
424template <>
425inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
426{
427 const float32x4x4_t acc =
428 {
429 {
430 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
431 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
432 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
433 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
434 }
435 };
436 return vquantize_signed(acc, requant_qinfo);
437}
438
439template <typename T>
440inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
441
442template <>
443inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
444{
445 const float32x4x2_t acc =
446 {
447 {
448 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
449 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
450 }
451 };
452 return vquantize(acc, requant_qinfo);
453}
454
455template <>
456inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
457{
458 const float32x4x2_t acc =
459 {
460 {
461 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
462 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
463 }
464 };
465 return vquantize_signed(acc, requant_qinfo);
466}
467
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000468} // namespace
469
470NEPoolingLayerKernel::NEPoolingLayerKernel()
morgolockcc1f6c92020-03-24 09:26:48 +0000471 : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000472{
473}
474
475BorderSize NEPoolingLayerKernel::border_size() const
476{
477 return _border_size;
478}
479
morgolockcc1f6c92020-03-24 09:26:48 +0000480void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000481{
482 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000483 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
484 const bool is_global_pooling = pool_info.is_global_pooling;
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000485 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000486
487 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000488 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
489 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
490 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000491
492 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000493 const Size2D pool_size(
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000494 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
495 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000496
497 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000498 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000499
500 // Check output dimensions
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100501 unsigned int pooled_w;
502 unsigned int pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000503 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
504 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000505 pool_size.x(),
506 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000507 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000508
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000509 // Perform validation step
morgolockcc1f6c92020-03-24 09:26:48 +0000510 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100511
512 // Set instance variables
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000513 _input = input;
514 _output = output;
morgolockcc1f6c92020-03-24 09:26:48 +0000515 _indices = indices;
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000516 _pool_info = pool_info;
517 _data_layout = input->info()->data_layout();
518 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100519
Georgios Pinitas55186712018-01-08 17:37:12 +0000520 // Get data type
521 const DataType data_type = input->info()->data_type();
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000522 const bool is_nchw = _data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000523
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100524 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000525 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000526 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000527 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000528 if(is_nchw)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100529 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000530 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000531 }
532 else
533 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000534 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000535 }
536 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000537 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000538 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000539 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000540 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000541 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000542 }
543 else
544 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000545 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000546 }
547 }
548 else
549 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000550 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000551 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000552 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000553 }
554 else
555 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000556 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
557 }
558 }
559 }
560 else if(data_type == DataType::QASYMM8_SIGNED)
561 {
562 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
563 {
564 if(is_nchw)
565 {
566 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
567 }
568 else
569 {
570 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
571 }
572 }
573 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
574 {
575 if(is_nchw)
576 {
577 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
578 }
579 else
580 {
581 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
582 }
583 }
584 else
585 {
586 if(is_nchw)
587 {
588 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
589 }
590 else
591 {
592 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000593 }
594 }
595 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000596 else if(data_type == DataType::F16)
597 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000598 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000599 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000600 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000601 {
602 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000603 {
604 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000605 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000606 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000607 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000608 else
609 {
610 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
611 }
612 }
613 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000614 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000615 {
616 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000617 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000618 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000619 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000620 else
621 {
622 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
623 }
624 }
625 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000626 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000627 {
628 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000629 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000630 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
631 }
632 else
633 {
634 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000635 }
636 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000637 }
638 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000639 }
640 }
641 else
642 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000643 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000644 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000645 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
646 }
647 else
648 {
649 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000650 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000651 }
652 }
653 else if(data_type == DataType::F32)
654 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000655 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000656 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000657 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000658 {
659 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000660 {
661 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000662 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000663 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
664 }
665 else
666 {
667 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000668 }
669 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000670 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000671 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000672 {
673 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000674 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000675 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
676 }
677 else
678 {
679 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000680 }
681 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000682 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000683 case 7:
Pablo Tello77e6c552018-12-04 15:33:49 +0000684 {
685 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000686 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000687 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
688 }
689 else
690 {
691 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000692 }
693 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000694 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000695 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000696 {
697 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000698 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000699 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
700 }
701 else
702 {
703 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000704 }
705 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000706 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000707 }
708 }
709 else
710 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000711 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000712 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000713 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
714 }
715 else
716 {
717 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000718 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000719 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100720 }
721
722 // Configure kernel window
morgolockcc1f6c92020-03-24 09:26:48 +0000723 auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
724 pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000725 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
726 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100727}
728
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000729template <typename T>
730void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000731{
732 Iterator input(_input, window_input);
733 Iterator output(_output, window);
734
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000735 /** NEON vector types */
736 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
737 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
738 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
739 using q16_t = typename wrapper::traits::promote_t<T>;
740 using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
741 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
742 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
743
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000744 constexpr int pool_size = 2;
745 int pool_stride_x = 0;
746 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000747 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
748 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
749 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
750 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
751 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000752 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
753 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000754
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000755 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
756 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000757
758 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
759
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100760 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
761 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
762 const bool have_different_qinfo = input_qinfo != output_qinfo;
763
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000764 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
765 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
766 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
767
Georgios Pinitas55186712018-01-08 17:37:12 +0000768 execute_window_loop(window, [&](const Coordinates & id)
769 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000770 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
771 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
772 q8x8_t lower_res = {};
773 q8x8_t upper_res = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000774
775 if(pooling_type != PoolingType::MAX)
776 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000777 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
778 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000779
780 // Add rows
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000781 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000782 {
783 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000784 wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
785 wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000786 }
787 };
788
789 // Pair-wise add row data
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000790 const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
791 const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +0000792
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000793 q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
Georgios Pinitas55186712018-01-08 17:37:12 +0000794
795 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000796 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
797 pool_size, upper_bound_w, upper_bound_h,
798 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
799 lower_res = wrapper::vmovn(res_lower);
Georgios Pinitas55186712018-01-08 17:37:12 +0000800
801 // Compute upper result for stride_x == 1
802 if(pool_stride_x == 1)
803 {
804 // Shifted row sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000805 const q16x8x2_t vrsum_shifted =
Georgios Pinitas55186712018-01-08 17:37:12 +0000806 {
807 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000808 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
809 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000810 }
811 };
812
813 // Pair-wise add shifted row
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000814 q16x8_t res_upper = wrapper::vcombine(
815 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
816 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
Georgios Pinitas55186712018-01-08 17:37:12 +0000817
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000818 // Scale upper result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000819 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
820 pool_size, upper_bound_w, upper_bound_h,
821 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
822 upper_res = wrapper::vmovn(res_upper);
Georgios Pinitas55186712018-01-08 17:37:12 +0000823 }
824 }
825 else
826 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000827 const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
828 lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
Georgios Pinitas55186712018-01-08 17:37:12 +0000829 if(pool_stride_x == 1)
830 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000831 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
832 upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
Georgios Pinitas55186712018-01-08 17:37:12 +0000833 }
834 }
835
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100836 if(have_different_qinfo)
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100837 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000838 const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000839 lower_res = wrapper::vgetlow(requantized_output);
840 upper_res = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100841 }
842
Georgios Pinitas55186712018-01-08 17:37:12 +0000843 // Store result
844 if(pool_stride_x == 1)
845 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000846 const q8x8x2_t res = { { lower_res, upper_res } };
847 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000848 }
849 else
850 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000851 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000852 }
853 },
854 input, output);
855}
856
Pablo Tello77e6c552018-12-04 15:33:49 +0000857void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100858{
Pablo Tello77e6c552018-12-04 15:33:49 +0000859 ARM_COMPUTE_UNUSED(pooling_type);
860 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000861#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100862 Iterator input(_input, window_input);
863 Iterator output(_output, window);
864
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000865 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000866 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
867 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
868 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
869 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000870 int pool_stride_x = 0;
871 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000872 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000873 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
874 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100875
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000876 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
877 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
878 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100879
880 execute_window_loop(window, [&](const Coordinates & id)
881 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100882 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
883 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
884 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
885 float16x4_t res = {};
886
887 // Get power of 2 in case of l2 pooling
888 if(pooling_type == PoolingType::L2)
889 {
890 top_data = vmul_f16(top_data, top_data);
891 middle_data = vmul_f16(middle_data, middle_data);
892 bottom_data = vmul_f16(bottom_data, bottom_data);
893 }
894
895 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100896 {
897 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000898 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100899 const float16x4_t scale_v = vdup_n_f16(scale);
900 // Perform pooling
901 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
902 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
903 res = vmul_f16(vpadd_f16(res, res), scale_v);
904 }
905 else
906 {
907 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
908 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
909 res = vpmax_f16(res, res);
910 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100911
912 // Calculate square-root in case of l2 pooling
913 if(pooling_type == PoolingType::L2)
914 {
915 res = vinv_f16(vinvsqrt_f16(res));
916 }
917
Pablo Tello0c34fe22017-06-26 17:17:42 +0100918 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
919 },
920 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000921#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100922 ARM_COMPUTE_UNUSED(window_input);
923 ARM_COMPUTE_UNUSED(window);
924 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000925#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100926}
927
Pablo Tello77e6c552018-12-04 15:33:49 +0000928void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100929{
Pablo Tello77e6c552018-12-04 15:33:49 +0000930 ARM_COMPUTE_UNUSED(pooling_type);
931 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000932#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100933 Iterator input(_input, window_input);
934 Iterator output(_output, window);
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000935 constexpr int pool_size = 2;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000936 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
937 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
938 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
939 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000940 int pool_stride_x, pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000941 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000942 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
943 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100944
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000945 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
946 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100947
948 execute_window_loop(window, [&](const Coordinates & id)
949 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100950 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
951 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
952 float16x4_t res = {};
Pablo Tello0c34fe22017-06-26 17:17:42 +0100953
Georgios Pinitascdf51452017-08-31 14:21:36 +0100954 // Get power of 2 in case of l2 pooling
955 if(pooling_type == PoolingType::L2)
956 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100957 top_data = vmul_f16(top_data, top_data);
958 bottom_data = vmul_f16(bottom_data, bottom_data);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100959 }
960
961 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100962 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000963 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100964 const float16x4_t scale_v = vdup_n_f16(scale);
965
966 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
967 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100968 }
969 else
970 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100971 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
972 res = vpmax_f16(max_data, max_data);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100973 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100974
975 // Calculate square-root in case of l2 pooling
976 if(pooling_type == PoolingType::L2)
977 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100978 res = vinv_f16(vinvsqrt_f16(res));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100979 }
980
981 // Store result
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100982 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100983 },
984 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000985#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100986 ARM_COMPUTE_UNUSED(window_input);
987 ARM_COMPUTE_UNUSED(window);
988 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000989#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100990}
991
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000992template <typename T>
993void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000994{
995 Iterator input(_input, window_input);
996 Iterator output(_output, window);
997
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000998 /** NEON vector types */
999 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1000 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
1001 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
1002 using q16_t = typename wrapper::traits::promote_t<T>;
1003 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1004 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
1005
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001006 constexpr int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001007 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1008 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1009 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1010 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001011 int pool_stride_x = 0;
1012 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001013 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001014 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1015 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001016
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001017 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1018 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001019
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001020 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
1021 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1022 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
1023
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001024 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
1025 const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
1026 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001027
1028 execute_window_loop(window, [&](const Coordinates & id)
1029 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001030 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
1031 const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
1032 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
1033 q8x8_t fres = {};
1034 q8x16_t fqres = {};
Georgios Pinitas55186712018-01-08 17:37:12 +00001035
1036 if(pooling_type == PoolingType::AVG)
1037 {
1038 // Convert data to u16
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001039 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
1040 const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
1041 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +00001042
1043 // Calculate row sums
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001044 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001045 {
1046 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001047 wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
1048 wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001049 }
1050 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001051 const q16x8x2_t vrsum_shifted_1 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001052 {
1053 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001054 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
1055 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001056 }
1057 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001058 const q16x8x2_t vrsum_shifted_2 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001059 {
1060 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001061 wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
1062 wrapper::vext_2(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001063 }
1064 };
1065 // Calculate final sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001066 q16x8x2_t final_sum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001067 {
1068 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001069 wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1070 wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001071 }
1072 };
1073 if(pool_stride_x == 2)
1074 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001075 q16x8_t res =
Georgios Pinitas55186712018-01-08 17:37:12 +00001076 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001077 wrapper::vgetlane(final_sum.val[0], 0),
1078 wrapper::vgetlane(final_sum.val[0], 2),
1079 wrapper::vgetlane(final_sum.val[0], 4),
1080 wrapper::vgetlane(final_sum.val[0], 6),
1081 wrapper::vgetlane(final_sum.val[1], 0),
1082 wrapper::vgetlane(final_sum.val[1], 2),
1083 wrapper::vgetlane(final_sum.val[1], 4),
1084 wrapper::vgetlane(final_sum.val[1], 6),
Georgios Pinitas55186712018-01-08 17:37:12 +00001085 };
1086
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001087 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
1088 pool_size, upper_bound_w, upper_bound_h,
1089 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1090 fres = wrapper::vmovn(res);
Georgios Pinitas55186712018-01-08 17:37:12 +00001091 }
1092 else
1093 {
1094 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001095 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1096 pool_size, upper_bound_w, upper_bound_h,
1097 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001098 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001099 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1100 pool_size, upper_bound_w, upper_bound_h,
1101 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1102 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +00001103 }
1104 }
1105 else
1106 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001107 const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1108 const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1109 const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1110 const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
Georgios Pinitas55186712018-01-08 17:37:12 +00001111
1112 if(pool_stride_x == 2)
1113 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001114 const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1115 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1116 fres = wrapper::vtbl(table, lookup_val);
Georgios Pinitas55186712018-01-08 17:37:12 +00001117 }
1118 else
1119 {
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001120 fqres = final_max;
Georgios Pinitas55186712018-01-08 17:37:12 +00001121 }
1122 }
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001123
1124 // Store result
1125 if(pool_stride_x == 1)
1126 {
1127 if(input_qinfo != output_qinfo)
1128 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001129 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001130 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001131 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001132 }
1133 else
1134 {
1135 if(input_qinfo != output_qinfo)
1136 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001137 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001138 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001139 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001140 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001141 },
1142 input, output);
1143}
1144
Pablo Tello77e6c552018-12-04 15:33:49 +00001145void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001146{
Pablo Tello77e6c552018-12-04 15:33:49 +00001147 ARM_COMPUTE_UNUSED(pooling_type);
1148 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001149#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1150 Iterator input(_input, window_input);
1151 Iterator output(_output, window);
1152
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001153 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1154 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1155 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1156 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1157 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1158 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001159 int pool_stride_x = 0;
1160 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001161 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001162 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1163 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1164
1165 execute_window_loop(window, [&](const Coordinates & id)
1166 {
1167 float16_t res = 0.0f;
1168 float16x8_t vres = vdupq_n_f16(0.0f);
1169
1170 if(pooling_type != PoolingType::MAX)
1171 {
1172 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001173 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001174
1175 // Perform pooling
1176
1177 for(int y = 0; y < pool_size_y; ++y)
1178 {
1179 int x = 0;
1180 for(; x <= (pool_size_x - 8); x += 8)
1181 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001182 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1183 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001184
1185 // Get power of 2 in case of l2 pooling and accumulate
1186 if(pooling_type == PoolingType::L2)
1187 {
1188 vres = vaddq_f16(vres, vmulq_f16(data, data));
1189 }
1190 else
1191 {
1192 vres = vaddq_f16(vres, data);
1193 }
1194 }
1195
1196 // Leftover for loop
1197 for(; x < pool_size_x; ++x)
1198 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001199 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1200 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001201
1202 // Get power of 2 in case of l2 pooling
1203 if(pooling_type == PoolingType::L2)
1204 {
1205 data *= data;
1206 }
1207
1208 res += data;
1209 }
1210 }
1211
1212 // Reduction
1213 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1214 res += vget_lane_f16(tmp, 0);
1215 res += vget_lane_f16(tmp, 1);
1216 res += vget_lane_f16(tmp, 2);
1217 res += vget_lane_f16(tmp, 3);
1218
1219 // Divide by scale
1220 res *= scale;
1221 }
1222 else
1223 {
1224 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1225 res = std::numeric_limits<float>::lowest();
1226
1227 for(int y = 0; y < pool_size_y; ++y)
1228 {
1229 int x = 0;
1230 for(; x <= (pool_size_x - 8); x += 8)
1231 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001232 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1233 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001234 vres = vmaxq_f16(vres, data);
1235 }
1236
1237 // Leftover for loop
1238 for(; x < pool_size_x; ++x)
1239 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001240 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1241 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1242 res = std::max(res, data);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001243 }
1244 }
1245
1246 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1247 res = std::max(res, vget_lane_f16(tmp, 0));
1248 res = std::max(res, vget_lane_f16(tmp, 1));
1249 res = std::max(res, vget_lane_f16(tmp, 2));
1250 res = std::max(res, vget_lane_f16(tmp, 3));
1251 }
1252
1253 // Calculate square-root in case of l2 pooling
1254 if(pooling_type == PoolingType::L2)
1255 {
1256 res = std::sqrt(res);
1257 }
1258
1259 // Store result
1260 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1261 },
1262 input, output);
1263
1264#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1265 ARM_COMPUTE_UNUSED(window_input);
1266 ARM_COMPUTE_UNUSED(window);
1267 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1268#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1269}
1270
Pablo Tello77e6c552018-12-04 15:33:49 +00001271void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001272{
Pablo Tello77e6c552018-12-04 15:33:49 +00001273 ARM_COMPUTE_UNUSED(pooling_type);
1274 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001275#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1276 Iterator input(_input, window_input);
1277 Iterator output(_output, window);
1278
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001279 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1280 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1281 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1282 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1283 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1284 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001285 int pool_stride_x = 0;
1286 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001287 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001288 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1289 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1290
1291 float16x8_t vres;
1292
1293 execute_window_loop(window, [&](const Coordinates & id)
1294 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001295 const int idx_width = id.y() * pool_stride_x;
1296 const int idx_height = id.z() * pool_stride_y;
1297 const int pool_limit_y = pool_pad_top - idx_height;
1298 const int pool_limit_x = pool_pad_left - idx_width;
1299
1300 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1301 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1302 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1303 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1304
Michalis Spyrou57dac842018-03-01 16:03:50 +00001305 if(pooling_type != PoolingType::MAX)
1306 {
1307 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001308 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1309 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001310 const float16x8_t scale_v = vdupq_n_f16(scale);
1311
1312 // Perform pooling
1313 vres = vdupq_n_f16(0.0f);
Michalis Spyrouced25572018-10-01 16:26:20 +01001314 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001315 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001316 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001317 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001318 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1319 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001320
1321 // Get power of 2 in case of l2 pooling and accumulate
1322 if(pooling_type == PoolingType::L2)
1323 {
1324 vres = vaddq_f16(vres, vmulq_f16(data, data));
1325 }
1326 else
1327 {
1328 vres = vaddq_f16(vres, data);
1329 }
1330 }
1331 }
1332 // Divide by scale
1333 vres = vmulq_f16(vres, scale_v);
1334 }
1335 else
1336 {
1337 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001338
1339 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001340 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001341 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001342 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001343 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1344 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001345 vres = vmaxq_f16(vres, data);
1346 }
1347 }
1348 }
1349
1350 // Calculate square-root in case of l2 pooling
1351 if(pooling_type == PoolingType::L2)
1352 {
1353 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1354 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1355 }
1356
1357 // Store result
1358 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
1359 },
1360 input, output);
1361
1362#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1363 ARM_COMPUTE_UNUSED(window_input);
1364 ARM_COMPUTE_UNUSED(window);
1365 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1366#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1367}
1368
Pablo Tello77e6c552018-12-04 15:33:49 +00001369void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001370{
1371 Iterator input(_input, window_input);
1372 Iterator output(_output, window);
1373
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001374 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1375 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1376 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1377 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1378 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1379 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001380 int pool_stride_x = 0;
1381 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001382 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001383 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1384 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001385
1386 execute_window_loop(window, [&](const Coordinates & id)
1387 {
1388 float res = 0.0f;
1389
1390 if(pooling_type != PoolingType::MAX)
1391 {
1392 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001393 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001394
1395 // Perform pooling
1396 float32x4_t vres = vdupq_n_f32(0.0f);
1397
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001398 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001399 {
1400 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001401 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001402 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001403 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1404 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001405
1406 // Get power of 2 in case of l2 pooling and accumulate
1407 if(pooling_type == PoolingType::L2)
1408 {
1409 vres = vmlaq_f32(vres, data, data);
1410 }
1411 else
1412 {
1413 vres = vaddq_f32(vres, data);
1414 }
1415 }
1416
1417 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001418 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001419 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001420 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1421 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001422
1423 // Get power of 2 in case of l2 pooling
1424 if(pooling_type == PoolingType::L2)
1425 {
1426 data *= data;
1427 }
1428
1429 res += data;
1430 }
1431 }
1432
1433#if defined(__aarch64__)
1434 // Reduction operation available on 64 bit architectures only
1435 res += vaddvq_f32(vres);
1436#else // __aarch64__
1437 // Reduction
1438 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1439 tmp = vpadd_f32(tmp, tmp);
1440
1441 res += vget_lane_f32(tmp, 0);
1442#endif // __aarch64__
1443 // Divide by scale
1444 res *= scale;
1445 }
1446 else
1447 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001448 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1449 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001450
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001451 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001452 {
1453 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001454 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001455 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001456 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1457 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001458 vres = vmaxq_f32(vres, data);
1459 }
1460
1461 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001462 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001463 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001464 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1465 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001466 res = std::max(res, data);
1467 }
1468 }
Gian Marco Iodice16824302017-09-28 15:41:37 +01001469#if defined(__aarch64__)
1470 // Reduction operation available on 64 bit architectures only
1471 res = std::max(vmaxvq_f32(vres), res);
1472#else // __aarch64__
1473 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1474 tmp = vpmax_f32(tmp, tmp);
1475
1476 res = std::max(res, vget_lane_f32(tmp, 0));
1477#endif // __aarch64__
1478 }
1479
1480 // Calculate square-root in case of l2 pooling
1481 if(pooling_type == PoolingType::L2)
1482 {
1483 res = std::sqrt(res);
1484 }
1485
1486 // Store result
1487 *(reinterpret_cast<float *>(output.ptr())) = res;
1488 },
1489 input, output);
1490}
1491
morgolockcc1f6c92020-03-24 09:26:48 +00001492void NEPoolingLayerKernel::pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window)
Pablo Tello77e6c552018-12-04 15:33:49 +00001493{
morgolockcc1f6c92020-03-24 09:26:48 +00001494 Iterator input(_input, window_input);
1495 Iterator output(_output, window);
1496 Iterator indices(_indices, window);
1497 int final_index = 0;
1498 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1499 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1500 int pool_stride_x = 0;
1501 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001502 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001503 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1504 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1505
morgolockcc1f6c92020-03-24 09:26:48 +00001506 const Strides &input_strides = _input->info()->strides_in_bytes();
1507 const auto in_stridew = input_strides[1];
1508
1509 execute_window_loop(window, [&](const Coordinates &)
Pablo Tello77e6c552018-12-04 15:33:49 +00001510 {
morgolockcc1f6c92020-03-24 09:26:48 +00001511 const auto input_offset_top = input_top_ptr + input.offset();
1512 const auto input_offset_bottom = input_bottom_ptr + input.offset();
1513 const auto in_top_ptr = reinterpret_cast<const float *>(input_offset_top);
1514 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_offset_bottom);
1515 float32x2_t top_data = vld1_f32(in_top_ptr);
1516 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1517 float32x2_t res = {};
1518 float final_res = 0;
1519 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1520 res = vpmax_f32(max_data, max_data);
1521 final_res = vget_lane_f32(res, 0);
Pablo Tello77e6c552018-12-04 15:33:49 +00001522 // Store result
1523 *(reinterpret_cast<float *>(output.ptr())) = final_res;
morgolockcc1f6c92020-03-24 09:26:48 +00001524 const uint32_t offset_top = (uint32_t)(input.offset() / sizeof(float));
1525 const uint32_t offset_bottom = (uint32_t)offset_top + (in_stridew / sizeof(float));
1526 const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
1527 const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
1528 const uint32x2_t tmp_indices = vbsl_u32(vcgt_f32(top_data, bottom_data), voffset_top, voffset_bottom);
1529 final_index = vget_lane_u32(vbsl_u32(vcgt_f32(max_data, vrev64_f32(max_data)), tmp_indices, vrev64_u32(tmp_indices)), 0);
1530 *(reinterpret_cast<int *>(indices.ptr())) = final_index;
Pablo Tello77e6c552018-12-04 15:33:49 +00001531 },
morgolockcc1f6c92020-03-24 09:26:48 +00001532 input, output, indices);
1533}
1534
1535void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
1536 bool exclude_padding)
1537{
1538 if(pooling_type == PoolingType::MAX && _indices)
1539 {
1540 pooling2_f32_nchw_maxpool_indices(window_input, window);
1541 }
1542 else
1543 {
1544 Iterator input(_input, window_input);
1545 Iterator output(_output, window);
1546 constexpr int pool_size = 2;
1547 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1548 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1549 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1550 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1551 int pool_stride_x = 0;
1552 int pool_stride_y = 0;
1553 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1554 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1555 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1556
1557 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1558 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1559
1560 execute_window_loop(window, [&](const Coordinates & id)
1561 {
1562 const auto in_top_ptr = reinterpret_cast<const float *>(input_top_ptr + input.offset());
1563 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
1564 float32x2_t top_data = vld1_f32(in_top_ptr);
1565 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1566 float32x2_t res = {};
1567 float final_res = 0;
1568 // Get power of 2 in case of l2 pooling
1569 if(pooling_type == PoolingType::L2)
1570 {
1571 top_data = vmul_f32(top_data, top_data);
1572 bottom_data = vmul_f32(bottom_data, bottom_data);
1573 }
1574
1575 if(pooling_type != PoolingType::MAX)
1576 {
1577 // Calculate scale
1578 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1579 const float32x2_t scale_v = vdup_n_f32(scale);
1580
1581 // Perform pooling
1582 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1583 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1584 }
1585 else
1586 {
1587 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1588 res = vpmax_f32(max_data, max_data);
1589 }
1590 final_res = vget_lane_f32(res, 0);
1591
1592 // Calculate square-root in case of l2 pooling
1593 if(pooling_type == PoolingType::L2)
1594 {
1595 final_res = sqrt(final_res);
1596 }
1597
1598 // Store result
1599 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1600 },
1601 input, output);
1602 }
Pablo Tello77e6c552018-12-04 15:33:49 +00001603}
1604
1605void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1606{
1607 Iterator input(_input, window_input);
1608 Iterator output(_output, window);
1609
1610 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001611 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1612 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1613 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1614 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001615 int pool_stride_x = 0;
1616 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001617 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001618 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1619 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1620
1621 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1622 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1623 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1624
1625 execute_window_loop(window, [&](const Coordinates & id)
1626 {
1627 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1628 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1629 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1630 float32x2_t res = {};
1631 float final_res = 0;
1632
1633 // Get power of 2 in case of l2 pooling
1634 if(pooling_type == PoolingType::L2)
1635 {
1636 top_data = vmulq_f32(top_data, top_data);
1637 middle_data = vmulq_f32(middle_data, middle_data);
1638 bottom_data = vmulq_f32(bottom_data, bottom_data);
1639 }
1640
1641 if(pooling_type != PoolingType::MAX)
1642 {
1643 // Calculate scale
1644 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1645 const float32x2_t scale_v = vdup_n_f32(scale);
1646
1647 // Perform pooling
1648 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1649 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1650 res = vmul_f32(vpadd_f32(res, res), scale_v);
1651 }
1652 else
1653 {
1654 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1655 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1656 res = vpmax_f32(res, res);
1657 }
1658 final_res = vget_lane_f32(res, 0);
1659
1660 // Calculate square-root in case of l2 pooling
1661 if(pooling_type == PoolingType::L2)
1662 {
1663 final_res = sqrt(final_res);
1664 }
1665
1666 // Store result
1667 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1668 },
1669 input, output);
1670}
1671
1672void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1673{
1674 Iterator input(_input, window_input);
1675 Iterator output(_output, window);
1676
1677 constexpr const int pool_size = 7;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001678 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1679 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1680 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1681 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001682 int pool_stride_x = 0;
1683 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001684 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001685 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1686 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1687
1688 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1689 for(int i = 0; i < pool_size; ++i)
1690 {
1691 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1692 }
1693
1694 execute_window_loop(window, [&](const Coordinates & id)
1695 {
1696 float32x2_t res = {};
1697 float final_res = 0.f;
1698 if(pooling_type != PoolingType::MAX)
1699 {
1700 // Calculate scale
1701 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1702 const float32x2_t scale_v = vdup_n_f32(scale);
1703
1704 // Perform pooling
1705 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1706 // Get power of 2 in case of l2 pooling
1707 if(pooling_type == PoolingType::L2)
1708 {
1709 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1710 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1711 }
1712 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1713 for(int i = 1; i < pool_size; ++i)
1714 {
1715 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1716 // Get power of 2 in case of l2 pooling
1717 if(pooling_type == PoolingType::L2)
1718 {
1719 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1720 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1721 }
1722 sum_data = vaddq_f32(sum_data, data.val[0]);
1723 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1724 }
1725 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1726 res = vmul_f32(vpadd_f32(res, res), scale_v);
1727 }
1728 else
1729 {
1730 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1731 for(int i = 1; i < pool_size; ++i)
1732 {
1733 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1734 max_data = vmax2q_f32(max_data, data);
1735 }
1736 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1737 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1738 res = vpmax_f32(res, res);
1739 }
1740 final_res = vget_lane_f32(res, 0);
1741
1742 // Calculate square-root in case of l2 pooling
1743 if(pooling_type == PoolingType::L2)
1744 {
1745 final_res = sqrt(final_res);
1746 }
1747
1748 // Store result
1749 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1750 },
1751 input, output);
1752}
1753
1754void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001755{
morgolocke383c352020-04-03 16:57:46 +01001756 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1757 {
1758 pooling2_f32_nhwc_maxpool_indices(window_input, window);
1759 }
1760 else
1761 {
1762 Iterator input(_input, window_input);
1763 Iterator output(_output, window);
1764
1765 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1766 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1767 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1768 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1769 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1770 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1771 int pool_stride_x = 0;
1772 int pool_stride_y = 0;
1773 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1774 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1775 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1776
1777 float32x4_t vres;
1778
1779 execute_window_loop(window, [&](const Coordinates & id)
1780 {
1781 const int idx_width = id.y() * pool_stride_x;
1782 const int idx_height = id.z() * pool_stride_y;
1783 const int pool_limit_y = pool_pad_top - idx_height;
1784 const int pool_limit_x = pool_pad_left - idx_width;
1785
1786 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1787 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1788 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1789 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1790
1791 if(pooling_type != PoolingType::MAX)
1792 {
1793 // Calculate scale
1794 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1795 pool_stride_y);
1796 const float32x4_t scale_v = vdupq_n_f32(scale);
1797
1798 // Perform pooling
1799 vres = vdupq_n_f32(0.0f);
1800
1801 for(int y = pool_start_y; y < pool_end_y; ++y)
1802 {
1803 for(int x = pool_start_x; x < pool_end_x; ++x)
1804 {
1805 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1806 (_input->info()->strides_in_bytes().z())));
1807
1808 // Get power of 2 in case of l2 pooling and accumulate
1809 if(pooling_type == PoolingType::L2)
1810 {
1811 vres = vmlaq_f32(vres, data, data);
1812 }
1813 else
1814 {
1815 vres = vaddq_f32(vres, data);
1816 }
1817 }
1818 }
1819 // Divide by scale
1820 vres = vmulq_f32(vres, scale_v);
1821 }
1822 else
1823 {
1824 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1825 for(int y = pool_start_y; y < pool_end_y; ++y)
1826 {
1827 for(int x = pool_start_x; x < pool_end_x; ++x)
1828 {
1829 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1830 (_input->info()->strides_in_bytes().z())));
1831 vres = vmaxq_f32(vres, data);
1832 }
1833 }
1834 }
1835
1836 // Calculate square-root in case of l2 pooling
1837 if(pooling_type == PoolingType::L2)
1838 {
1839 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1840 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
1841 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
1842 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
1843 };
1844 vres = l2_res;
1845 }
1846
1847 // Store result
1848 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
1849 },
1850 input, output);
1851 }
1852}
1853
1854void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
1855{
Michalis Spyrou57dac842018-03-01 16:03:50 +00001856 Iterator input(_input, window_input);
1857 Iterator output(_output, window);
morgolocke383c352020-04-03 16:57:46 +01001858 Iterator indices(_indices, window);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001859
morgolocke383c352020-04-03 16:57:46 +01001860 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1861 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1862
1863 int pool_stride_x = 0;
1864 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001865 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001866
1867 float32x4_t vres;
1868
morgolocke383c352020-04-03 16:57:46 +01001869 const int pad_right = _input->info()->padding().right;
1870 const int pad_top = _input->info()->padding().top;
1871 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
1872 const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
1873 const int in_stride_w = static_cast<int>(_input->info()->strides_in_bytes()[3]);
1874
Michalis Spyrou57dac842018-03-01 16:03:50 +00001875 execute_window_loop(window, [&](const Coordinates & id)
1876 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001877 const int idx_width = id.y() * pool_stride_x;
1878 const int idx_height = id.z() * pool_stride_y;
1879 const int pool_limit_y = pool_pad_top - idx_height;
1880 const int pool_limit_x = pool_pad_left - idx_width;
1881
1882 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
Michalis Spyrouced25572018-10-01 16:26:20 +01001883 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
morgolocke383c352020-04-03 16:57:46 +01001884 const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
1885 (_input->info()->strides_in_bytes().z());
1886 const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
1887 (_input->info()->strides_in_bytes().z());
Michalis Spyrouced25572018-10-01 16:26:20 +01001888
morgolocke383c352020-04-03 16:57:46 +01001889 const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
1890 (_input->info()->strides_in_bytes().z());
Michalis Spyrou57dac842018-03-01 16:03:50 +00001891
morgolocke383c352020-04-03 16:57:46 +01001892 const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
1893 (_input->info()->strides_in_bytes().z());
Michalis Spyrou57dac842018-03-01 16:03:50 +00001894
morgolocke383c352020-04-03 16:57:46 +01001895 const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
1896 const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
1897 const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
1898 const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
1899 const auto v_x0 = vld1q_f32(in_x0_ptr);
1900 const auto v_x1 = vld1q_f32(in_x1_ptr);
1901 const auto v_x2 = vld1q_f32(in_x2_ptr);
1902 const auto v_x3 = vld1q_f32(in_x3_ptr);
1903 vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001904 // Store result
1905 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
morgolocke383c352020-04-03 16:57:46 +01001906
1907 const uint32_t offset_base = input.offset()
1908 - sizeof(float) * pad_right * id.y() * pool_stride_x /* subtract padding elems per row */
1909 - pad_top * sizeof(float) /* top padding */
1910 - sizeof(float) * pad_right * _input->info()->tensor_shape()[1] * id.z() * pool_stride_y /* for each Z plane there are width*pad_right padding elems */
1911 - in_stride_w * id[3] + _input->info()->tensor_shape()[0] * sizeof(float) * id[3];
1912
1913 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float);
1914 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
1915 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
1916 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
1917
1918 const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
1919 const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
1920 const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
1921 const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
1922 const uint32x4_t tmp_indices0 = vbslq_u32(vcgtq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
1923 const uint32x4_t tmp_indices1 = vbslq_u32(vcgtq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
1924 const uint32x4_t tmp_indices2 = vbslq_u32(vcgtq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
1925
1926 vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()), tmp_indices2);
1927
Michalis Spyrou57dac842018-03-01 16:03:50 +00001928 },
morgolocke383c352020-04-03 16:57:46 +01001929 input, output, indices);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001930}
1931
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001932template <typename T>
1933void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00001934{
1935 Iterator input(_input, window_input);
1936 Iterator output(_output, window);
1937
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001938 /** NEON vector types */
1939 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1940 using q16_t = typename wrapper::traits::promote_t<T>;
1941 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1942 using q32_t = typename wrapper::traits::promote_t<q16_t>;
1943 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
1944
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001945 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1946 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1947 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1948 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1949 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1950 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001951 int pool_stride_x = 0;
1952 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001953 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001954 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1955 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001956
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001957 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1958 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
1959
Georgios Pinitas55186712018-01-08 17:37:12 +00001960 execute_window_loop(window, [&](const Coordinates & id)
1961 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001962 T res = std::numeric_limits<T>::min();
Georgios Pinitas55186712018-01-08 17:37:12 +00001963
1964 if(pooling_type != PoolingType::MAX)
1965 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001966 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1967 q32_t sres = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001968
1969 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001970 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001971
1972 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001973 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001974 {
1975 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001976 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001977 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001978 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1979 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00001980
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001981 const q16x8_t data_q16 = wrapper::vmovl(data);
1982 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001983 }
1984
1985 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001986 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001987 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001988 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1989 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00001990 sres += data;
1991 }
1992 }
1993
1994 // Reduction
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001995 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
1996 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
Georgios Pinitas55186712018-01-08 17:37:12 +00001997
1998 // Divide by scale
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001999 res = static_cast<T>(support::cpp11::round(sres * scale));
Georgios Pinitas55186712018-01-08 17:37:12 +00002000 }
2001 else
2002 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002003 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
Georgios Pinitas55186712018-01-08 17:37:12 +00002004
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002005 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002006 {
2007 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002008 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002009 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002010 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2011 (_input->info()->strides_in_bytes().y())));
2012 vres = wrapper::vmax(vres, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002013 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002014 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002015 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002016 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002017 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2018 (_input->info()->strides_in_bytes().y())));
2019 res = std::max(res, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002020 }
2021 }
2022
2023 // Reduce max
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002024 vres = wrapper::vpmax(vres, vres);
2025 vres = wrapper::vpmax(vres, vres);
2026 vres = wrapper::vpmax(vres, vres);
Georgios Pinitas55186712018-01-08 17:37:12 +00002027
2028 // Get max value
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002029 res = std::max(res, wrapper::vgetlane(vres, 0));
Georgios Pinitas55186712018-01-08 17:37:12 +00002030 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002031 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002032 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
2033 *(reinterpret_cast<T *>(output.ptr())) = res;
Georgios Pinitas55186712018-01-08 17:37:12 +00002034 },
2035 input, output);
2036}
2037
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002038template <typename T>
2039void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002040{
2041 Iterator input(_input, window_input);
2042 Iterator output(_output, window);
2043
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002044 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2045 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
2046 using q16_t = typename wrapper::traits::promote_t<T>;
2047 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2048 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2049 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2050
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002051 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
2052 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
2053 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2054 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2055 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2056 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002057
2058 int pool_stride_x = 0;
2059 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002060 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002061 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
2062 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
2063
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01002064 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
2065 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
2066 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitas283fc602018-11-09 10:46:43 +00002067
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002068 const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002069 // "new_offset" doesn't have to consider the "half_scale_v" in its computation
2070 // With a requantization performed in a single step there won't be uncertainties introduced
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002071 const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002072
2073 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
2074 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
2075 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
2076
Michalis Spyrou57dac842018-03-01 16:03:50 +00002077 execute_window_loop(window, [&](const Coordinates & id)
2078 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002079 const int idx_width = id.y() * pool_stride_x;
2080 const int idx_height = id.z() * pool_stride_y;
2081 const int pool_limit_y = pool_pad_top - idx_height;
2082 const int pool_limit_x = pool_pad_left - idx_width;
2083
2084 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2085 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
2086 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2087 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
2088
Michalis Spyrou57dac842018-03-01 16:03:50 +00002089 if(pooling_type != PoolingType::MAX)
2090 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002091 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2092 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2093 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2094 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002095
2096 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00002097 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2098 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002099
2100 // Perform pooling
Michalis Spyrouced25572018-10-01 16:26:20 +01002101 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002102 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002103 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002104 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002105 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2106 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002107
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002108 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
2109 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
2110 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
2111 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
2112 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
2113 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002114 }
2115 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002116
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002117 if(input_qinfo != output_qinfo)
2118 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002119 const float32x4x4_t vres =
2120 {
2121 {
2122 vcvtq_f32_q32(vres1),
2123 vcvtq_f32_q32(vres2),
2124 vcvtq_f32_q32(vres3),
2125 vcvtq_f32_q32(vres4),
2126 }
2127 };
2128 const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
2129 // Store result
2130 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), wrapper::vgetlow(requantized_output));
2131 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, wrapper::vgethigh(requantized_output));
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002132 }
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002133 else
2134 {
2135 const float32x4_t scale_v = vdupq_n_f32(scale);
2136 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2137 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
2138 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
2139 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
2140 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002141
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002142 const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
2143 const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
2144 // Store result
2145 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res1);
2146 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, res2);
2147 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002148 }
2149 else
2150 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002151 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002152
Michalis Spyrouced25572018-10-01 16:26:20 +01002153 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002154 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002155 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002156 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002157 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2158 (_input->info()->strides_in_bytes().z())));
2159 vres = wrapper::vmax(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002160 }
2161 }
2162
2163 // Store result
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002164 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002165 }
2166 },
2167 input, output);
2168}
2169
morgolockcc1f6c92020-03-24 09:26:48 +00002170Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002171{
2172 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2173
2174 unsigned int pooled_w = 0;
2175 unsigned int pooled_h = 0;
2176 unsigned int num_elems_processed_per_iteration = 0;
2177 BorderSize border_size(0);
2178
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002179 const bool is_global_pooling = pool_info.is_global_pooling;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002180 unsigned int pool_size_x = 0;
2181 unsigned int pool_size_y = 0;
2182
2183 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +00002184 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2185 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2186 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002187
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002188 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2189 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002190
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002191 // Validate pool info before calling scaled_dimensions
2192 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002193
2194 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00002195 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2196 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002197 pool_size_x,
2198 pool_size_y,
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002199 pool_info.pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002200
morgolockcc1f6c92020-03-24 09:26:48 +00002201 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
2202 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
2203 (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002204 pool_size_x, pool_size_y)
2205 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002206
2207 return Status{};
2208}
2209
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002210void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002211{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002212 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002213 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2214 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2215 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2216
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002217 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
2218 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
2219 const unsigned int pool_size = _pool_info.pool_size.width;
2220 const bool exclude_padding = _pool_info.exclude_padding;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002221
Michalis Spyrou57dac842018-03-01 16:03:50 +00002222 Window window_input(window);
Georgios Pinitas14d9d982019-12-13 12:33:09 +00002223 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002224 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002225 // Set step for input in x and y direction for the input
2226 unsigned int window_x_inc = 0;
2227 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01002228 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002229 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002230 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002231 {
2232 window_x_inc = pool_stride_x;
2233 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2234 {
2235 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2236 }
2237 break;
2238 }
Pablo Tello77e6c552018-12-04 15:33:49 +00002239
Georgios Pinitas13d96e02018-08-23 11:20:23 +01002240 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002241 case DataType::F32:
2242 {
2243 window_x_inc = pool_stride_x;
2244 break;
2245 }
2246 default:
2247 {
2248 ARM_COMPUTE_ERROR("Not supported");
2249 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002250 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002251 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2252 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002253 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002254 else
2255 {
Georgios Pinitascac13b12018-04-27 19:07:19 +01002256 window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002257 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2258 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2259 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002260
2261 // Run function
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002262 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002263}
morgolockcc1f6c92020-03-24 09:26:48 +00002264} // namespace arm_compute