blob: 75b33f2e9098a180af0657662642b9f075cbafb6 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002 * Copyright (c) 2017-2020 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010038#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010039
Georgios Pinitas55186712018-01-08 17:37:12 +000040#include "support/ToolchainSupport.h"
41
Manuel Bottinib4bb8272019-12-18 18:01:27 +000042#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043#include <algorithm>
44#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010045#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010047#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010048#include <string>
49#include <tuple>
50
Manuel Bottinib4bb8272019-12-18 18:01:27 +000051namespace arm_compute
52{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010053using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010054
55namespace
56{
Pablo Tello77e6c552018-12-04 15:33:49 +000057inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010058 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
59{
Michalis Spyrou57dac842018-03-01 16:03:50 +000060 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
61 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
62
63 int start_x = id[idx_width] * stride_x - pad_x;
64 int start_y = id[idx_height] * stride_y - pad_y;
65
66 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
67 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000068 if(exclude_padding)
69 {
70 start_x = std::max(0, start_x);
71 start_y = std::max(0, start_y);
72 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073 return 1.f / ((end_y - start_y) * (end_x - start_x));
74}
75
Manuel Bottinib4bb8272019-12-18 18:01:27 +000076template <typename T, typename TVec>
77inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
Georgios Pinitas55186712018-01-08 17:37:12 +000078 const int pool_size, const int upper_bound_w, const int upper_bound_h,
79 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
80{
81 int start_x = (id.x() + id_offset) * stride_x - pad_x;
82 int start_y = id.y() * stride_y - pad_y;
83 const int end_y = std::min(start_y + pool_size, upper_bound_h);
84 if(exclude_padding)
85 {
86 start_y = std::max(0, start_y);
87 }
88
Manuel Bottinib4bb8272019-12-18 18:01:27 +000089 std::array<T, 8> elems =
Georgios Pinitas55186712018-01-08 17:37:12 +000090 {
91 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +000092 wrapper::vgetlane(v, 0),
93 wrapper::vgetlane(v, 1),
94 wrapper::vgetlane(v, 2),
95 wrapper::vgetlane(v, 3),
96 wrapper::vgetlane(v, 4),
97 wrapper::vgetlane(v, 5),
98 wrapper::vgetlane(v, 6),
99 wrapper::vgetlane(v, 7),
Georgios Pinitas55186712018-01-08 17:37:12 +0000100 }
101 };
102
103 for(auto &el : elems)
104 {
105 int c_start_x = start_x;
106 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
107 if(exclude_padding)
108 {
109 c_start_x = std::max(0, c_start_x);
110 }
111 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
112 el *= scale;
113 start_x += step * stride_x;
114 }
115
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000116 v = wrapper::vsetlane(elems[0], v, 0);
117 v = wrapper::vsetlane(elems[1], v, 1);
118 v = wrapper::vsetlane(elems[2], v, 2);
119 v = wrapper::vsetlane(elems[3], v, 3);
120 v = wrapper::vsetlane(elems[4], v, 4);
121 v = wrapper::vsetlane(elems[5], v, 5);
122 v = wrapper::vsetlane(elems[6], v, 6);
123 v = wrapper::vsetlane(elems[7], v, 7);
Georgios Pinitas55186712018-01-08 17:37:12 +0000124}
125
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100126Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100127{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000128 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100129
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000130 int pool_stride_x = 0;
131 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000132 PoolingType pool_type = pool_info.pool_type;
133 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100135
Anthony Barbiereaefd002018-07-20 17:49:35 +0100136 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000137 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000138 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000139
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000140 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100141 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000142 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000143 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
144 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
145 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100146 }
147
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000148 return Status{};
149}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100150
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000151Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000152{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000153 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
154 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000155
156 return Status{};
157}
158
159std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
160 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000161 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000162{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100163 // Output auto inizialitation if not yet initialized
164 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
165
Michalis Spyrou57dac842018-03-01 16:03:50 +0000166 DataLayout data_layout = input->data_layout();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000167 unsigned int num_elems_read_per_iteration = 0;
168 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000169 int pool_stride_x = 0;
170 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000171 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
172 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
173 const int input_width = input->dimension(idx_width);
174 const int input_height = input->dimension(idx_height);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000175 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000176 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000177 const int pool_pad_right = pad_stride_info.pad_right();
178 const int pool_pad_top = pad_stride_info.pad_top();
179 const int pool_pad_left = pad_stride_info.pad_left();
180 const int pool_pad_bottom = pad_stride_info.pad_bottom();
181 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000182
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000183 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +0000184 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
185 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000186 pool_size_x,
187 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000188 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100189
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000190 //If it's not squared and optimized will be executed the MxN
191 num_elems_read_per_iteration = 1;
192 num_elems_processed_per_iteration = 1;
193 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100194
Michalis Spyrou57dac842018-03-01 16:03:50 +0000195 const bool is_nhwc = data_layout == DataLayout::NHWC;
196
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000197 if(is_square)
198 {
199 switch(input->data_type())
200 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000201 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000202 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000203 if(is_nhwc)
204 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100205 num_elems_processed_per_iteration = 16;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000206 break;
207 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000208 switch(pool_size_x)
209 {
210 case 2:
211 num_elems_read_per_iteration = 16;
212 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
213 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
214 break;
215 case 3:
216 num_elems_read_per_iteration = 16;
217 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
218 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
219 break;
220 default:
221 break;
222 }
223 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000224#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
225 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000226 if(is_nhwc)
227 {
228 num_elems_processed_per_iteration = 8;
229 break;
230 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000231 switch(pool_size_x)
232 {
233 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000234 case 3:
235 num_elems_read_per_iteration = 4;
236 num_elems_processed_per_iteration = 1;
237 num_elems_horizontal_window = 1;
238 break;
239 default:
240 break;
241 }
242 break;
243#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
244 case DataType::F32:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000245 if(is_nhwc)
246 {
Georgios Pinitas64f1a902018-09-18 13:42:51 +0100247 num_elems_processed_per_iteration = 4;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000248 break;
249 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000250 switch(pool_size_x)
251 {
252 case 2:
253 num_elems_read_per_iteration = 2;
254 break;
255 case 3:
256 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
257 break;
258 case 7:
259 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
260 break;
261 default:
262 break;
263 }
264 num_elems_processed_per_iteration = 1;
265 num_elems_horizontal_window = 1;
266 break;
267 default:
268 ARM_COMPUTE_ERROR("Element size not supported");
269 break;
270 }
271 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000272 else
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000273 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000274 if(is_nhwc)
275 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100276 num_elems_processed_per_iteration = 16 / input->element_size();
Michalis Spyrou57dac842018-03-01 16:03:50 +0000277 }
278 }
279
280 bool window_changed = false;
281 Window win{};
282 if(data_layout == DataLayout::NCHW)
283 {
284 // Number of iterations in X dimension
285 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
286
287 // Upper limit for the number of right/bottom border elements that are accessed
288 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
289 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
290
291 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
292 border_size.right = std::max(upper_bound_w, pool_pad_right);
293 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
294
295 TensorShape output_shape{ input->tensor_shape() };
296 output_shape.set(0, pooled_w);
297 output_shape.set(1, pooled_h);
298 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
299
300 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
301 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
302
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000303 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
304 window_changed = update_window_and_padding(win, input_access, output_access);
305 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
306 }
307 else
308 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000309 TensorShape output_shape{ input->tensor_shape() };
310 output_shape.set(1, pooled_w);
311 output_shape.set(2, pooled_h);
312 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
313
314 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
315 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
316
317 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
318 window_changed = update_window_and_padding(win, input_access, output_access);
319 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000320 }
321
322 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
323 return std::make_pair(err, win);
324}
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000325
326template <typename T>
327inline T vquantize_q8(const float32x4x2_t &qv, const UniformQuantizationInfo &qi);
328
329template <>
330inline uint8x8_t vquantize_q8(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
331{
332 return vquantize(qv, qi);
333}
334
335template <>
336inline int8x8_t vquantize_q8(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
337{
338 return vquantize_signed(qv, qi);
339}
340
341template <typename T>
342inline T vquantize_q8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi);
343
344template <>
345inline uint8x16_t vquantize_q8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
346{
347 return vquantize(qv, qi);
348}
349
350template <>
351inline int8x16_t vquantize_q8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
352{
353 return vquantize_signed(qv, qi);
354}
355
356template <typename T>
357inline T vcvtq_q32_f32(float32x4_t values);
358
359template <>
360inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
361{
362 return vcvtq_u32_f32(values);
363}
364
365template <>
366inline int32x4_t vcvtq_q32_f32(float32x4_t values)
367{
368 return vcvtq_s32_f32(values);
369}
370
371template <typename T>
372inline float32x4_t vcvtq_f32_q32(T values);
373
374template <>
375inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
376{
377 return vcvtq_f32_u32(values);
378}
379
380template <>
381inline float32x4_t vcvtq_f32_q32(int32x4_t values)
382{
383 return vcvtq_f32_s32(values);
384}
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000385} // namespace
386
387NEPoolingLayerKernel::NEPoolingLayerKernel()
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000388 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000389{
390}
391
392BorderSize NEPoolingLayerKernel::border_size() const
393{
394 return _border_size;
395}
396
397void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
398{
399 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
400
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000401 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
402 const bool is_global_pooling = pool_info.is_global_pooling;
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000403 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000404
405 // Get data layout
406 const DataLayout data_layout = input->info()->data_layout();
407 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
408 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000409
410 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000411 const Size2D pool_size(
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000412 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
413 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000414
415 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000416 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000417
418 // Check output dimensions
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100419 unsigned int pooled_w;
420 unsigned int pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000421 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
422 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000423 pool_size.x(),
424 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000425 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000426
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000427 // Perform validation step
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100428 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100429
430 // Set instance variables
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000431 _input = input;
432 _output = output;
433 _pool_info = pool_info;
434 _data_layout = input->info()->data_layout();
435 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100436
Georgios Pinitas55186712018-01-08 17:37:12 +0000437 // Get data type
438 const DataType data_type = input->info()->data_type();
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000439 const bool is_nchw = _data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000440
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100441 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000442 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000443 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000444 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000445 if(is_nchw)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100446 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000447 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000448 }
449 else
450 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000451 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000452 }
453 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000454 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000455 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000456 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000457 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000458 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000459 }
460 else
461 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000462 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000463 }
464 }
465 else
466 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000467 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000468 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000469 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000470 }
471 else
472 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000473 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
474 }
475 }
476 }
477 else if(data_type == DataType::QASYMM8_SIGNED)
478 {
479 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
480 {
481 if(is_nchw)
482 {
483 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
484 }
485 else
486 {
487 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
488 }
489 }
490 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
491 {
492 if(is_nchw)
493 {
494 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
495 }
496 else
497 {
498 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
499 }
500 }
501 else
502 {
503 if(is_nchw)
504 {
505 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
506 }
507 else
508 {
509 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000510 }
511 }
512 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000513 else if(data_type == DataType::F16)
514 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000515 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000516 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000517 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000518 {
519 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000520 {
521 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000522 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000523 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000524 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000525 else
526 {
527 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
528 }
529 }
530 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000531 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000532 {
533 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000534 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000535 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000536 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000537 else
538 {
539 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
540 }
541 }
542 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000543 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000544 {
545 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000546 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000547 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
548 }
549 else
550 {
551 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000552 }
553 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000554 }
555 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000556 }
557 }
558 else
559 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000560 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000561 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000562 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
563 }
564 else
565 {
566 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000567 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000568 }
569 }
570 else if(data_type == DataType::F32)
571 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000572 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000573 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000574 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000575 {
576 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000577 {
578 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000579 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000580 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
581 }
582 else
583 {
584 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000585 }
586 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000587 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000588 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000589 {
590 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000591 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000592 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
593 }
594 else
595 {
596 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000597 }
598 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000599 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000600 case 7:
Pablo Tello77e6c552018-12-04 15:33:49 +0000601 {
602 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000603 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000604 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
605 }
606 else
607 {
608 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000609 }
610 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000611 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000612 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000613 {
614 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000615 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000616 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
617 }
618 else
619 {
620 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000621 }
622 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000623 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000624 }
625 }
626 else
627 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000628 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000629 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000630 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
631 }
632 else
633 {
634 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000635 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000636 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100637 }
638
639 // Configure kernel window
Pablo Tello77e6c552018-12-04 15:33:49 +0000640 auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000641 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
642 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100643}
644
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000645template <typename T>
646void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000647{
648 Iterator input(_input, window_input);
649 Iterator output(_output, window);
650
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000651 /** NEON vector types */
652 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
653 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
654 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
655 using q16_t = typename wrapper::traits::promote_t<T>;
656 using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
657 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
658 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
659
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000660 constexpr int pool_size = 2;
661 int pool_stride_x = 0;
662 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000663 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
664 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
665 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
666 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
667 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000668 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
669 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000670
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000671 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
672 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000673
674 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
675
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100676 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
677 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
678 const bool have_different_qinfo = input_qinfo != output_qinfo;
679
Georgios Pinitas55186712018-01-08 17:37:12 +0000680 execute_window_loop(window, [&](const Coordinates & id)
681 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000682 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
683 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
684 q8x8_t lower_res = {};
685 q8x8_t upper_res = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000686
687 if(pooling_type != PoolingType::MAX)
688 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000689 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
690 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000691
692 // Add rows
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000693 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000694 {
695 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000696 wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
697 wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000698 }
699 };
700
701 // Pair-wise add row data
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000702 const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
703 const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +0000704
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000705 q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
Georgios Pinitas55186712018-01-08 17:37:12 +0000706
707 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000708 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
709 pool_size, upper_bound_w, upper_bound_h,
710 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
711 lower_res = wrapper::vmovn(res_lower);
Georgios Pinitas55186712018-01-08 17:37:12 +0000712
713 // Compute upper result for stride_x == 1
714 if(pool_stride_x == 1)
715 {
716 // Shifted row sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000717 const q16x8x2_t vrsum_shifted =
Georgios Pinitas55186712018-01-08 17:37:12 +0000718 {
719 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000720 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
721 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000722 }
723 };
724
725 // Pair-wise add shifted row
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000726 q16x8_t res_upper = wrapper::vcombine(
727 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
728 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
Georgios Pinitas55186712018-01-08 17:37:12 +0000729
730 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000731 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
732 pool_size, upper_bound_w, upper_bound_h,
733 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
734 upper_res = wrapper::vmovn(res_upper);
Georgios Pinitas55186712018-01-08 17:37:12 +0000735 }
736 }
737 else
738 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000739 const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
740 lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
Georgios Pinitas55186712018-01-08 17:37:12 +0000741 if(pool_stride_x == 1)
742 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000743 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
744 upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
Georgios Pinitas55186712018-01-08 17:37:12 +0000745 }
746 }
747
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100748 if(have_different_qinfo)
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100749 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000750 const auto requantized_output = vquantize_q8<q8x16_t>(vdequantize(wrapper::vcombine(lower_res, upper_res), input_qinfo), output_qinfo);
751 lower_res = wrapper::vgetlow(requantized_output);
752 upper_res = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100753 }
754
Georgios Pinitas55186712018-01-08 17:37:12 +0000755 // Store result
756 if(pool_stride_x == 1)
757 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000758 const q8x8x2_t res = { { lower_res, upper_res } };
759 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000760 }
761 else
762 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000763 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000764 }
765 },
766 input, output);
767}
768
Pablo Tello77e6c552018-12-04 15:33:49 +0000769void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100770{
Pablo Tello77e6c552018-12-04 15:33:49 +0000771 ARM_COMPUTE_UNUSED(pooling_type);
772 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000773#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100774 Iterator input(_input, window_input);
775 Iterator output(_output, window);
776
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000777 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000778 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
779 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
780 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
781 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000782 int pool_stride_x = 0;
783 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000784 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000785 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
786 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100787
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000788 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
789 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
790 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100791
792 execute_window_loop(window, [&](const Coordinates & id)
793 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100794 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
795 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
796 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
797 float16x4_t res = {};
798
799 // Get power of 2 in case of l2 pooling
800 if(pooling_type == PoolingType::L2)
801 {
802 top_data = vmul_f16(top_data, top_data);
803 middle_data = vmul_f16(middle_data, middle_data);
804 bottom_data = vmul_f16(bottom_data, bottom_data);
805 }
806
807 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100808 {
809 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000810 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100811 const float16x4_t scale_v = vdup_n_f16(scale);
812 // Perform pooling
813 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
814 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
815 res = vmul_f16(vpadd_f16(res, res), scale_v);
816 }
817 else
818 {
819 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
820 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
821 res = vpmax_f16(res, res);
822 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100823
824 // Calculate square-root in case of l2 pooling
825 if(pooling_type == PoolingType::L2)
826 {
827 res = vinv_f16(vinvsqrt_f16(res));
828 }
829
Pablo Tello0c34fe22017-06-26 17:17:42 +0100830 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
831 },
832 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000833#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100834 ARM_COMPUTE_UNUSED(window_input);
835 ARM_COMPUTE_UNUSED(window);
836 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000837#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100838}
839
Pablo Tello77e6c552018-12-04 15:33:49 +0000840void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100841{
Pablo Tello77e6c552018-12-04 15:33:49 +0000842 ARM_COMPUTE_UNUSED(pooling_type);
843 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000844#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100845 Iterator input(_input, window_input);
846 Iterator output(_output, window);
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000847 constexpr int pool_size = 2;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000848 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
849 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
850 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
851 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000852 int pool_stride_x, pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000853 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000854 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
855 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100856
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000857 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
858 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100859
860 execute_window_loop(window, [&](const Coordinates & id)
861 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100862 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
863 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
864 float16x4_t res = {};
Pablo Tello0c34fe22017-06-26 17:17:42 +0100865
Georgios Pinitascdf51452017-08-31 14:21:36 +0100866 // Get power of 2 in case of l2 pooling
867 if(pooling_type == PoolingType::L2)
868 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100869 top_data = vmul_f16(top_data, top_data);
870 bottom_data = vmul_f16(bottom_data, bottom_data);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100871 }
872
873 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100874 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000875 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100876 const float16x4_t scale_v = vdup_n_f16(scale);
877
878 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
879 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100880 }
881 else
882 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100883 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
884 res = vpmax_f16(max_data, max_data);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100885 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100886
887 // Calculate square-root in case of l2 pooling
888 if(pooling_type == PoolingType::L2)
889 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100890 res = vinv_f16(vinvsqrt_f16(res));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100891 }
892
893 // Store result
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100894 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100895 },
896 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000897#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100898 ARM_COMPUTE_UNUSED(window_input);
899 ARM_COMPUTE_UNUSED(window);
900 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000901#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100902}
903
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000904template <typename T>
905void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000906{
907 Iterator input(_input, window_input);
908 Iterator output(_output, window);
909
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000910 /** NEON vector types */
911 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
912 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
913 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
914 using q16_t = typename wrapper::traits::promote_t<T>;
915 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
916 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
917
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000918 constexpr int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000919 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
920 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
921 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
922 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000923 int pool_stride_x = 0;
924 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000925 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000926 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
927 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000928
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100929 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
930 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitasd66094e2019-04-15 15:44:17 +0100931
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000932 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
933 const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
934 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000935
936 execute_window_loop(window, [&](const Coordinates & id)
937 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000938 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
939 const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
940 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
941 q8x8_t fres = {};
942 q8x16_t fqres = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000943
944 if(pooling_type == PoolingType::AVG)
945 {
946 // Convert data to u16
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000947 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
948 const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
949 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000950
951 // Calculate row sums
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000952 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000953 {
954 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000955 wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
956 wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000957 }
958 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000959 const q16x8x2_t vrsum_shifted_1 =
Georgios Pinitas55186712018-01-08 17:37:12 +0000960 {
961 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000962 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
963 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000964 }
965 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000966 const q16x8x2_t vrsum_shifted_2 =
Georgios Pinitas55186712018-01-08 17:37:12 +0000967 {
968 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000969 wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
970 wrapper::vext_2(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000971 }
972 };
973 // Calculate final sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000974 q16x8x2_t final_sum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000975 {
976 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000977 wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
978 wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000979 }
980 };
981 if(pool_stride_x == 2)
982 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000983 q16x8_t res =
Georgios Pinitas55186712018-01-08 17:37:12 +0000984 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000985 wrapper::vgetlane(final_sum.val[0], 0),
986 wrapper::vgetlane(final_sum.val[0], 2),
987 wrapper::vgetlane(final_sum.val[0], 4),
988 wrapper::vgetlane(final_sum.val[0], 6),
989 wrapper::vgetlane(final_sum.val[1], 0),
990 wrapper::vgetlane(final_sum.val[1], 2),
991 wrapper::vgetlane(final_sum.val[1], 4),
992 wrapper::vgetlane(final_sum.val[1], 6),
Georgios Pinitas55186712018-01-08 17:37:12 +0000993 };
994
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000995 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
996 pool_size, upper_bound_w, upper_bound_h,
997 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
998 fres = wrapper::vmovn(res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000999 }
1000 else
1001 {
1002 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001003 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1004 pool_size, upper_bound_w, upper_bound_h,
1005 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001006 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001007 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1008 pool_size, upper_bound_w, upper_bound_h,
1009 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1010 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +00001011 }
1012 }
1013 else
1014 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001015 const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1016 const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1017 const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1018 const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
Georgios Pinitas55186712018-01-08 17:37:12 +00001019
1020 if(pool_stride_x == 2)
1021 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001022 const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1023 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1024 fres = wrapper::vtbl(table, lookup_val);
Georgios Pinitas55186712018-01-08 17:37:12 +00001025 }
1026 else
1027 {
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001028 fqres = final_max;
Georgios Pinitas55186712018-01-08 17:37:12 +00001029 }
1030 }
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001031
1032 // Store result
1033 if(pool_stride_x == 1)
1034 {
1035 if(input_qinfo != output_qinfo)
1036 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001037 fqres = vquantize_q8<q8x16_t>(vdequantize(fqres, input_qinfo), output_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001038 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001039 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001040 }
1041 else
1042 {
1043 if(input_qinfo != output_qinfo)
1044 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001045 fres = vquantize_q8<q8x8_t>(vdequantize(fres, input_qinfo), output_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001046 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001047 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001048 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001049 },
1050 input, output);
1051}
1052
Pablo Tello77e6c552018-12-04 15:33:49 +00001053void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001054{
Pablo Tello77e6c552018-12-04 15:33:49 +00001055 ARM_COMPUTE_UNUSED(pooling_type);
1056 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001057#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1058 Iterator input(_input, window_input);
1059 Iterator output(_output, window);
1060
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001061 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1062 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1063 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1064 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1065 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1066 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001067 int pool_stride_x = 0;
1068 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001069 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001070 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1071 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1072
1073 execute_window_loop(window, [&](const Coordinates & id)
1074 {
1075 float16_t res = 0.0f;
1076 float16x8_t vres = vdupq_n_f16(0.0f);
1077
1078 if(pooling_type != PoolingType::MAX)
1079 {
1080 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001081 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001082
1083 // Perform pooling
1084
1085 for(int y = 0; y < pool_size_y; ++y)
1086 {
1087 int x = 0;
1088 for(; x <= (pool_size_x - 8); x += 8)
1089 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001090 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1091 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001092
1093 // Get power of 2 in case of l2 pooling and accumulate
1094 if(pooling_type == PoolingType::L2)
1095 {
1096 vres = vaddq_f16(vres, vmulq_f16(data, data));
1097 }
1098 else
1099 {
1100 vres = vaddq_f16(vres, data);
1101 }
1102 }
1103
1104 // Leftover for loop
1105 for(; x < pool_size_x; ++x)
1106 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001107 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1108 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001109
1110 // Get power of 2 in case of l2 pooling
1111 if(pooling_type == PoolingType::L2)
1112 {
1113 data *= data;
1114 }
1115
1116 res += data;
1117 }
1118 }
1119
1120 // Reduction
1121 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1122 res += vget_lane_f16(tmp, 0);
1123 res += vget_lane_f16(tmp, 1);
1124 res += vget_lane_f16(tmp, 2);
1125 res += vget_lane_f16(tmp, 3);
1126
1127 // Divide by scale
1128 res *= scale;
1129 }
1130 else
1131 {
1132 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1133 res = std::numeric_limits<float>::lowest();
1134
1135 for(int y = 0; y < pool_size_y; ++y)
1136 {
1137 int x = 0;
1138 for(; x <= (pool_size_x - 8); x += 8)
1139 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001140 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1141 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001142 vres = vmaxq_f16(vres, data);
1143 }
1144
1145 // Leftover for loop
1146 for(; x < pool_size_x; ++x)
1147 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001148 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1149 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1150 res = std::max(res, data);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001151 }
1152 }
1153
1154 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1155 res = std::max(res, vget_lane_f16(tmp, 0));
1156 res = std::max(res, vget_lane_f16(tmp, 1));
1157 res = std::max(res, vget_lane_f16(tmp, 2));
1158 res = std::max(res, vget_lane_f16(tmp, 3));
1159 }
1160
1161 // Calculate square-root in case of l2 pooling
1162 if(pooling_type == PoolingType::L2)
1163 {
1164 res = std::sqrt(res);
1165 }
1166
1167 // Store result
1168 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1169 },
1170 input, output);
1171
1172#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1173 ARM_COMPUTE_UNUSED(window_input);
1174 ARM_COMPUTE_UNUSED(window);
1175 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1176#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1177}
1178
Pablo Tello77e6c552018-12-04 15:33:49 +00001179void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001180{
Pablo Tello77e6c552018-12-04 15:33:49 +00001181 ARM_COMPUTE_UNUSED(pooling_type);
1182 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001183#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1184 Iterator input(_input, window_input);
1185 Iterator output(_output, window);
1186
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001187 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1188 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1189 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1190 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1191 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1192 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001193 int pool_stride_x = 0;
1194 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001195 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001196 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1197 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1198
1199 float16x8_t vres;
1200
1201 execute_window_loop(window, [&](const Coordinates & id)
1202 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001203 const int idx_width = id.y() * pool_stride_x;
1204 const int idx_height = id.z() * pool_stride_y;
1205 const int pool_limit_y = pool_pad_top - idx_height;
1206 const int pool_limit_x = pool_pad_left - idx_width;
1207
1208 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1209 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1210 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1211 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1212
Michalis Spyrou57dac842018-03-01 16:03:50 +00001213 if(pooling_type != PoolingType::MAX)
1214 {
1215 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001216 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1217 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001218 const float16x8_t scale_v = vdupq_n_f16(scale);
1219
1220 // Perform pooling
1221 vres = vdupq_n_f16(0.0f);
Michalis Spyrouced25572018-10-01 16:26:20 +01001222 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001223 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001224 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001225 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001226 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1227 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001228
1229 // Get power of 2 in case of l2 pooling and accumulate
1230 if(pooling_type == PoolingType::L2)
1231 {
1232 vres = vaddq_f16(vres, vmulq_f16(data, data));
1233 }
1234 else
1235 {
1236 vres = vaddq_f16(vres, data);
1237 }
1238 }
1239 }
1240 // Divide by scale
1241 vres = vmulq_f16(vres, scale_v);
1242 }
1243 else
1244 {
1245 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001246
1247 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001248 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001249 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001250 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001251 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1252 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001253 vres = vmaxq_f16(vres, data);
1254 }
1255 }
1256 }
1257
1258 // Calculate square-root in case of l2 pooling
1259 if(pooling_type == PoolingType::L2)
1260 {
1261 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1262 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1263 }
1264
1265 // Store result
1266 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
1267 },
1268 input, output);
1269
1270#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1271 ARM_COMPUTE_UNUSED(window_input);
1272 ARM_COMPUTE_UNUSED(window);
1273 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1274#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1275}
1276
Pablo Tello77e6c552018-12-04 15:33:49 +00001277void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001278{
1279 Iterator input(_input, window_input);
1280 Iterator output(_output, window);
1281
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001282 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1283 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1284 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1285 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1286 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1287 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001288 int pool_stride_x = 0;
1289 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001290 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001291 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1292 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001293
1294 execute_window_loop(window, [&](const Coordinates & id)
1295 {
1296 float res = 0.0f;
1297
1298 if(pooling_type != PoolingType::MAX)
1299 {
1300 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001301 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001302
1303 // Perform pooling
1304 float32x4_t vres = vdupq_n_f32(0.0f);
1305
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001306 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001307 {
1308 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001309 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001310 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001311 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1312 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001313
1314 // Get power of 2 in case of l2 pooling and accumulate
1315 if(pooling_type == PoolingType::L2)
1316 {
1317 vres = vmlaq_f32(vres, data, data);
1318 }
1319 else
1320 {
1321 vres = vaddq_f32(vres, data);
1322 }
1323 }
1324
1325 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001326 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001327 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001328 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1329 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001330
1331 // Get power of 2 in case of l2 pooling
1332 if(pooling_type == PoolingType::L2)
1333 {
1334 data *= data;
1335 }
1336
1337 res += data;
1338 }
1339 }
1340
1341#if defined(__aarch64__)
1342 // Reduction operation available on 64 bit architectures only
1343 res += vaddvq_f32(vres);
1344#else // __aarch64__
1345 // Reduction
1346 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1347 tmp = vpadd_f32(tmp, tmp);
1348
1349 res += vget_lane_f32(tmp, 0);
1350#endif // __aarch64__
1351 // Divide by scale
1352 res *= scale;
1353 }
1354 else
1355 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001356 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1357 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001358
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001359 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001360 {
1361 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001362 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001363 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001364 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1365 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001366 vres = vmaxq_f32(vres, data);
1367 }
1368
1369 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001370 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001371 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001372 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1373 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001374 res = std::max(res, data);
1375 }
1376 }
1377
1378#if defined(__aarch64__)
1379 // Reduction operation available on 64 bit architectures only
1380 res = std::max(vmaxvq_f32(vres), res);
1381#else // __aarch64__
1382 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1383 tmp = vpmax_f32(tmp, tmp);
1384
1385 res = std::max(res, vget_lane_f32(tmp, 0));
1386#endif // __aarch64__
1387 }
1388
1389 // Calculate square-root in case of l2 pooling
1390 if(pooling_type == PoolingType::L2)
1391 {
1392 res = std::sqrt(res);
1393 }
1394
1395 // Store result
1396 *(reinterpret_cast<float *>(output.ptr())) = res;
1397 },
1398 input, output);
1399}
1400
Pablo Tello77e6c552018-12-04 15:33:49 +00001401void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1402{
1403 Iterator input(_input, window_input);
1404 Iterator output(_output, window);
1405
1406 constexpr int pool_size = 2;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001407 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1408 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1409 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1410 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001411 int pool_stride_x = 0;
1412 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001413 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001414 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1415 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1416
1417 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1418 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1419
1420 execute_window_loop(window, [&](const Coordinates & id)
1421 {
1422 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1423 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1424 float32x2_t res = {};
1425 float final_res = 0;
1426
1427 // Get power of 2 in case of l2 pooling
1428 if(pooling_type == PoolingType::L2)
1429 {
1430 top_data = vmul_f32(top_data, top_data);
1431 bottom_data = vmul_f32(bottom_data, bottom_data);
1432 }
1433
1434 if(pooling_type != PoolingType::MAX)
1435 {
1436 // Calculate scale
1437 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1438 const float32x2_t scale_v = vdup_n_f32(scale);
1439
1440 // Perform pooling
1441 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1442 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1443 }
1444 else
1445 {
1446 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1447 res = vpmax_f32(max_data, max_data);
1448 }
1449 final_res = vget_lane_f32(res, 0);
1450
1451 // Calculate square-root in case of l2 pooling
1452 if(pooling_type == PoolingType::L2)
1453 {
1454 final_res = sqrt(final_res);
1455 }
1456
1457 // Store result
1458 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1459 },
1460 input, output);
1461}
1462
1463void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1464{
1465 Iterator input(_input, window_input);
1466 Iterator output(_output, window);
1467
1468 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001469 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1470 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1471 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1472 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001473 int pool_stride_x = 0;
1474 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001475 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001476 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1477 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1478
1479 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1480 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1481 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1482
1483 execute_window_loop(window, [&](const Coordinates & id)
1484 {
1485 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1486 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1487 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1488 float32x2_t res = {};
1489 float final_res = 0;
1490
1491 // Get power of 2 in case of l2 pooling
1492 if(pooling_type == PoolingType::L2)
1493 {
1494 top_data = vmulq_f32(top_data, top_data);
1495 middle_data = vmulq_f32(middle_data, middle_data);
1496 bottom_data = vmulq_f32(bottom_data, bottom_data);
1497 }
1498
1499 if(pooling_type != PoolingType::MAX)
1500 {
1501 // Calculate scale
1502 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1503 const float32x2_t scale_v = vdup_n_f32(scale);
1504
1505 // Perform pooling
1506 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1507 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1508 res = vmul_f32(vpadd_f32(res, res), scale_v);
1509 }
1510 else
1511 {
1512 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1513 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1514 res = vpmax_f32(res, res);
1515 }
1516 final_res = vget_lane_f32(res, 0);
1517
1518 // Calculate square-root in case of l2 pooling
1519 if(pooling_type == PoolingType::L2)
1520 {
1521 final_res = sqrt(final_res);
1522 }
1523
1524 // Store result
1525 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1526 },
1527 input, output);
1528}
1529
1530void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1531{
1532 Iterator input(_input, window_input);
1533 Iterator output(_output, window);
1534
1535 constexpr const int pool_size = 7;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001536 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1537 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1538 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1539 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001540 int pool_stride_x = 0;
1541 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001542 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001543 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1544 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1545
1546 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1547 for(int i = 0; i < pool_size; ++i)
1548 {
1549 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1550 }
1551
1552 execute_window_loop(window, [&](const Coordinates & id)
1553 {
1554 float32x2_t res = {};
1555 float final_res = 0.f;
1556 if(pooling_type != PoolingType::MAX)
1557 {
1558 // Calculate scale
1559 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1560 const float32x2_t scale_v = vdup_n_f32(scale);
1561
1562 // Perform pooling
1563 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1564 // Get power of 2 in case of l2 pooling
1565 if(pooling_type == PoolingType::L2)
1566 {
1567 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1568 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1569 }
1570 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1571 for(int i = 1; i < pool_size; ++i)
1572 {
1573 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1574 // Get power of 2 in case of l2 pooling
1575 if(pooling_type == PoolingType::L2)
1576 {
1577 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1578 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1579 }
1580 sum_data = vaddq_f32(sum_data, data.val[0]);
1581 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1582 }
1583 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1584 res = vmul_f32(vpadd_f32(res, res), scale_v);
1585 }
1586 else
1587 {
1588 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1589 for(int i = 1; i < pool_size; ++i)
1590 {
1591 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1592 max_data = vmax2q_f32(max_data, data);
1593 }
1594 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1595 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1596 res = vpmax_f32(res, res);
1597 }
1598 final_res = vget_lane_f32(res, 0);
1599
1600 // Calculate square-root in case of l2 pooling
1601 if(pooling_type == PoolingType::L2)
1602 {
1603 final_res = sqrt(final_res);
1604 }
1605
1606 // Store result
1607 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1608 },
1609 input, output);
1610}
1611
1612void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001613{
1614 Iterator input(_input, window_input);
1615 Iterator output(_output, window);
1616
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001617 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1618 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1619 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1620 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1621 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1622 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001623 int pool_stride_x = 0;
1624 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001625 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001626 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1627 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1628
1629 float32x4_t vres;
1630
1631 execute_window_loop(window, [&](const Coordinates & id)
1632 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001633 const int idx_width = id.y() * pool_stride_x;
1634 const int idx_height = id.z() * pool_stride_y;
1635 const int pool_limit_y = pool_pad_top - idx_height;
1636 const int pool_limit_x = pool_pad_left - idx_width;
1637
1638 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1639 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1640 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1641 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1642
Michalis Spyrou57dac842018-03-01 16:03:50 +00001643 if(pooling_type != PoolingType::MAX)
1644 {
1645 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001646 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1647 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001648 const float32x4_t scale_v = vdupq_n_f32(scale);
1649
1650 // Perform pooling
1651 vres = vdupq_n_f32(0.0f);
1652
Michalis Spyrouced25572018-10-01 16:26:20 +01001653 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001654 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001655 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001656 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001657 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1658 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001659
1660 // Get power of 2 in case of l2 pooling and accumulate
1661 if(pooling_type == PoolingType::L2)
1662 {
1663 vres = vmlaq_f32(vres, data, data);
1664 }
1665 else
1666 {
1667 vres = vaddq_f32(vres, data);
1668 }
1669 }
1670 }
1671 // Divide by scale
1672 vres = vmulq_f32(vres, scale_v);
1673 }
1674 else
1675 {
1676 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001677 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001678 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001679 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001680 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001681 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1682 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001683 vres = vmaxq_f32(vres, data);
1684 }
1685 }
1686 }
1687
1688 // Calculate square-root in case of l2 pooling
1689 if(pooling_type == PoolingType::L2)
1690 {
Georgios Pinitas27f223d2019-12-16 19:23:02 +00001691 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1692 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
1693 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
1694 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
1695 };
1696 vres = l2_res;
Michalis Spyrou57dac842018-03-01 16:03:50 +00001697 }
1698
1699 // Store result
1700 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
1701 },
1702 input, output);
1703}
1704
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001705template <typename T>
1706void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00001707{
1708 Iterator input(_input, window_input);
1709 Iterator output(_output, window);
1710
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001711 /** NEON vector types */
1712 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1713 using q16_t = typename wrapper::traits::promote_t<T>;
1714 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1715 using q32_t = typename wrapper::traits::promote_t<q16_t>;
1716 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
1717
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001718 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1719 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1720 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1721 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1722 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1723 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001724 int pool_stride_x = 0;
1725 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001726 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001727 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1728 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001729
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001730 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1731 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
1732
Georgios Pinitas55186712018-01-08 17:37:12 +00001733 execute_window_loop(window, [&](const Coordinates & id)
1734 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001735 T res = std::numeric_limits<T>::min();
Georgios Pinitas55186712018-01-08 17:37:12 +00001736
1737 if(pooling_type != PoolingType::MAX)
1738 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001739 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1740 q32_t sres = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001741
1742 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001743 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001744
1745 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001746 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001747 {
1748 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001749 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001750 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001751 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1752 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00001753
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001754 const q16x8_t data_q16 = wrapper::vmovl(data);
1755 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001756 }
1757
1758 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001759 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001760 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001761 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1762 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00001763 sres += data;
1764 }
1765 }
1766
1767 // Reduction
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001768 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
1769 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
Georgios Pinitas55186712018-01-08 17:37:12 +00001770
1771 // Divide by scale
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001772 res = static_cast<T>(support::cpp11::round(sres * scale));
Georgios Pinitas55186712018-01-08 17:37:12 +00001773 }
1774 else
1775 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001776 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
Georgios Pinitas55186712018-01-08 17:37:12 +00001777
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001778 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001779 {
1780 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001781 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001782 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001783 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1784 (_input->info()->strides_in_bytes().y())));
1785 vres = wrapper::vmax(vres, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00001786 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001787 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001788 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001789 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001790 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1791 (_input->info()->strides_in_bytes().y())));
1792 res = std::max(res, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00001793 }
1794 }
1795
1796 // Reduce max
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001797 vres = wrapper::vpmax(vres, vres);
1798 vres = wrapper::vpmax(vres, vres);
1799 vres = wrapper::vpmax(vres, vres);
Georgios Pinitas55186712018-01-08 17:37:12 +00001800
1801 // Get max value
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001802 res = std::max(res, wrapper::vgetlane(vres, 0));
Georgios Pinitas55186712018-01-08 17:37:12 +00001803 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001804 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001805 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
1806 *(reinterpret_cast<T *>(output.ptr())) = res;
Georgios Pinitas55186712018-01-08 17:37:12 +00001807 },
1808 input, output);
1809}
1810
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001811template <typename T>
1812void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001813{
1814 Iterator input(_input, window_input);
1815 Iterator output(_output, window);
1816
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001817 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1818 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
1819 using q16_t = typename wrapper::traits::promote_t<T>;
1820 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1821 using q32_t = typename wrapper::traits::promote_t<q16_t>;
1822 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
1823
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001824 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1825 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1826 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1827 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1828 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1829 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001830
1831 int pool_stride_x = 0;
1832 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001833 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001834 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1835 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1836
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001837 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
1838 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
1839 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitas283fc602018-11-09 10:46:43 +00001840
Michalis Spyrou57dac842018-03-01 16:03:50 +00001841 execute_window_loop(window, [&](const Coordinates & id)
1842 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001843 const int idx_width = id.y() * pool_stride_x;
1844 const int idx_height = id.z() * pool_stride_y;
1845 const int pool_limit_y = pool_pad_top - idx_height;
1846 const int pool_limit_x = pool_pad_left - idx_width;
1847
1848 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1849 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1850 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1851 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1852
Michalis Spyrou57dac842018-03-01 16:03:50 +00001853 if(pooling_type != PoolingType::MAX)
1854 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001855 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1856 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1857 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1858 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00001859
1860 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001861 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1862 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001863 const float32x4_t scale_v = vdupq_n_f32(scale);
1864
1865 // Perform pooling
Michalis Spyrouced25572018-10-01 16:26:20 +01001866 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001867 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001868 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001869 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001870 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1871 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001872
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001873 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
1874 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
1875 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
1876 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
1877 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
1878 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001879 }
1880 }
Georgios Pinitas283fc602018-11-09 10:46:43 +00001881 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001882 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
1883 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
1884 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
1885 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001886
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001887 q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
1888 q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
Pablo Telloa52e4cf2019-04-01 14:55:18 +01001889 if(input_qinfo != output_qinfo)
1890 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001891 const auto requantized_output = vquantize_q8<q8x16_t>(vdequantize(wrapper::vcombine(res1, res2), input_qinfo), output_qinfo);
1892 res1 = wrapper::vgetlow(requantized_output);
1893 res2 = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +01001894 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001895
1896 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001897 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res1);
1898 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, res2);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001899 }
1900 else
1901 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001902 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00001903
Michalis Spyrouced25572018-10-01 16:26:20 +01001904 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001905 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001906 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001907 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001908 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1909 (_input->info()->strides_in_bytes().z())));
1910 vres = wrapper::vmax(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001911 }
1912 }
1913
1914 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001915 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), (input_qinfo != output_qinfo) ? vquantize_q8<q8x16_t>(vdequantize(vres, input_qinfo), output_qinfo) : vres);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001916 }
1917 },
1918 input, output);
1919}
1920
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001921Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
1922{
1923 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
1924
1925 unsigned int pooled_w = 0;
1926 unsigned int pooled_h = 0;
1927 unsigned int num_elems_processed_per_iteration = 0;
1928 BorderSize border_size(0);
1929
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001930 const bool is_global_pooling = pool_info.is_global_pooling;
Michalis Spyrou57dac842018-03-01 16:03:50 +00001931 unsigned int pool_size_x = 0;
1932 unsigned int pool_size_y = 0;
1933
1934 // Get data layout
1935 const DataLayout data_layout = input->data_layout();
1936 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
1937 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
1938
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001939 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
1940 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001941
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001942 // Validate pool info before calling scaled_dimensions
1943 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001944
1945 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00001946 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
1947 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001948 pool_size_x,
1949 pool_size_y,
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001950 pool_info.pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001951
Georgios Pinitas13d96e02018-08-23 11:20:23 +01001952 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001953 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
1954 pool_size_x, pool_size_y)
1955 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001956
1957 return Status{};
1958}
1959
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001960void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001961{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001962 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001963 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1964 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1965 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1966
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001967 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
1968 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
1969 const unsigned int pool_size = _pool_info.pool_size.width;
1970 const bool exclude_padding = _pool_info.exclude_padding;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001971
Michalis Spyrou57dac842018-03-01 16:03:50 +00001972 Window window_input(window);
Georgios Pinitas14d9d982019-12-13 12:33:09 +00001973 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001974 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00001975 // Set step for input in x and y direction for the input
1976 unsigned int window_x_inc = 0;
1977 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01001978 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00001979 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001980 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +00001981 {
1982 window_x_inc = pool_stride_x;
1983 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
1984 {
1985 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1986 }
1987 break;
1988 }
Pablo Tello77e6c552018-12-04 15:33:49 +00001989
Georgios Pinitas13d96e02018-08-23 11:20:23 +01001990 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00001991 case DataType::F32:
1992 {
1993 window_x_inc = pool_stride_x;
1994 break;
1995 }
1996 default:
1997 {
1998 ARM_COMPUTE_ERROR("Not supported");
1999 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002000 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002001 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2002 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002003 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002004 else
2005 {
Georgios Pinitascac13b12018-04-27 19:07:19 +01002006 window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002007 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2008 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2009 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002010
2011 // Run function
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002012 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002013}
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002014} // namespace arm_compute