blob: 326bc77fcd93bb2e10227d67569535253448969e [file] [log] [blame]
/*
 * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010038#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010039
Georgios Pinitas55186712018-01-08 17:37:12 +000040#include "support/ToolchainSupport.h"
41
Manuel Bottinib4bb8272019-12-18 18:01:27 +000042#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043#include <algorithm>
44#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010045#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010047#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010048#include <string>
49#include <tuple>
50
Manuel Bottinib4bb8272019-12-18 18:01:27 +000051namespace arm_compute
52{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010053using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010054
55namespace
56{
Pablo Tello77e6c552018-12-04 15:33:49 +000057inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010058 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
59{
Michalis Spyrou57dac842018-03-01 16:03:50 +000060 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
61 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
62
63 int start_x = id[idx_width] * stride_x - pad_x;
64 int start_y = id[idx_height] * stride_y - pad_y;
65
66 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
67 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000068 if(exclude_padding)
69 {
70 start_x = std::max(0, start_x);
71 start_y = std::max(0, start_y);
72 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073 return 1.f / ((end_y - start_y) * (end_x - start_x));
74}
75
Manuel Bottinib4bb8272019-12-18 18:01:27 +000076template <typename T, typename TVec>
77inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
Georgios Pinitas55186712018-01-08 17:37:12 +000078 const int pool_size, const int upper_bound_w, const int upper_bound_h,
79 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
80{
81 int start_x = (id.x() + id_offset) * stride_x - pad_x;
82 int start_y = id.y() * stride_y - pad_y;
83 const int end_y = std::min(start_y + pool_size, upper_bound_h);
84 if(exclude_padding)
85 {
86 start_y = std::max(0, start_y);
87 }
88
Manuel Bottinib4bb8272019-12-18 18:01:27 +000089 std::array<T, 8> elems =
Georgios Pinitas55186712018-01-08 17:37:12 +000090 {
91 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +000092 wrapper::vgetlane(v, 0),
93 wrapper::vgetlane(v, 1),
94 wrapper::vgetlane(v, 2),
95 wrapper::vgetlane(v, 3),
96 wrapper::vgetlane(v, 4),
97 wrapper::vgetlane(v, 5),
98 wrapper::vgetlane(v, 6),
99 wrapper::vgetlane(v, 7),
Georgios Pinitas55186712018-01-08 17:37:12 +0000100 }
101 };
102
103 for(auto &el : elems)
104 {
105 int c_start_x = start_x;
106 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
107 if(exclude_padding)
108 {
109 c_start_x = std::max(0, c_start_x);
110 }
111 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
112 el *= scale;
113 start_x += step * stride_x;
114 }
115
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000116 v = wrapper::vsetlane(elems[0], v, 0);
117 v = wrapper::vsetlane(elems[1], v, 1);
118 v = wrapper::vsetlane(elems[2], v, 2);
119 v = wrapper::vsetlane(elems[3], v, 3);
120 v = wrapper::vsetlane(elems[4], v, 4);
121 v = wrapper::vsetlane(elems[5], v, 5);
122 v = wrapper::vsetlane(elems[6], v, 6);
123 v = wrapper::vsetlane(elems[7], v, 7);
Georgios Pinitas55186712018-01-08 17:37:12 +0000124}
125
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100126Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100127{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000128 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100129
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000130 int pool_stride_x = 0;
131 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000132 PoolingType pool_type = pool_info.pool_type;
133 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100135
Anthony Barbiereaefd002018-07-20 17:49:35 +0100136 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000137 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000138 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000139
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000140 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100141 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000142 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000143 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
144 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
145 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100146 }
147
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000148 return Status{};
149}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100150
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000151Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000152{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000153 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
154 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000155
156 return Status{};
157}
158
159std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
160 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000161 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000162{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100163 // Output auto inizialitation if not yet initialized
164 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
165
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000166 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000167 unsigned int num_elems_read_per_iteration = 0;
168 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000169 int pool_stride_x = 0;
170 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000171 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
172 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
173 const int input_width = input->dimension(idx_width);
174 const int input_height = input->dimension(idx_height);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000175 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000176 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000177 const int pool_pad_right = pad_stride_info.pad_right();
178 const int pool_pad_top = pad_stride_info.pad_top();
179 const int pool_pad_left = pad_stride_info.pad_left();
180 const int pool_pad_bottom = pad_stride_info.pad_bottom();
181 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000182
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000183 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +0000184 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
185 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000186 pool_size_x,
187 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000188 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100189
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000190 //If it's not squared and optimized will be executed the MxN
191 num_elems_read_per_iteration = 1;
192 num_elems_processed_per_iteration = 1;
193 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100194
Michalis Spyrou57dac842018-03-01 16:03:50 +0000195 const bool is_nhwc = data_layout == DataLayout::NHWC;
196
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000197 if(is_square)
198 {
199 switch(input->data_type())
200 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000201 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000202 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000203 if(is_nhwc)
204 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100205 num_elems_processed_per_iteration = 16;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000206 break;
207 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000208 switch(pool_size_x)
209 {
210 case 2:
211 num_elems_read_per_iteration = 16;
212 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
213 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
214 break;
215 case 3:
216 num_elems_read_per_iteration = 16;
217 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
218 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
219 break;
220 default:
221 break;
222 }
223 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000224#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
225 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000226 if(is_nhwc)
227 {
228 num_elems_processed_per_iteration = 8;
229 break;
230 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000231 switch(pool_size_x)
232 {
233 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000234 case 3:
235 num_elems_read_per_iteration = 4;
236 num_elems_processed_per_iteration = 1;
237 num_elems_horizontal_window = 1;
238 break;
239 default:
240 break;
241 }
242 break;
243#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
244 case DataType::F32:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000245 if(is_nhwc)
246 {
Georgios Pinitas64f1a902018-09-18 13:42:51 +0100247 num_elems_processed_per_iteration = 4;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000248 break;
249 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000250 switch(pool_size_x)
251 {
252 case 2:
253 num_elems_read_per_iteration = 2;
254 break;
255 case 3:
256 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
257 break;
258 case 7:
259 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
260 break;
261 default:
262 break;
263 }
264 num_elems_processed_per_iteration = 1;
265 num_elems_horizontal_window = 1;
266 break;
267 default:
268 ARM_COMPUTE_ERROR("Element size not supported");
269 break;
270 }
271 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000272 else
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000273 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000274 if(is_nhwc)
275 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100276 num_elems_processed_per_iteration = 16 / input->element_size();
Michalis Spyrou57dac842018-03-01 16:03:50 +0000277 }
278 }
279
280 bool window_changed = false;
281 Window win{};
282 if(data_layout == DataLayout::NCHW)
283 {
284 // Number of iterations in X dimension
285 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
286
287 // Upper limit for the number of right/bottom border elements that are accessed
288 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
289 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
290
291 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
292 border_size.right = std::max(upper_bound_w, pool_pad_right);
293 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
294
295 TensorShape output_shape{ input->tensor_shape() };
296 output_shape.set(0, pooled_w);
297 output_shape.set(1, pooled_h);
298 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
299
300 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
301 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
302
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000303 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
304 window_changed = update_window_and_padding(win, input_access, output_access);
305 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
306 }
307 else
308 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000309 TensorShape output_shape{ input->tensor_shape() };
310 output_shape.set(1, pooled_w);
311 output_shape.set(2, pooled_h);
312 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
313
314 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
315 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
316
317 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
318 window_changed = update_window_and_padding(win, input_access, output_access);
319 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000320 }
321
322 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
323 return std::make_pair(err, win);
324}
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000325
326template <typename T>
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000327inline T vcvtq_q32_f32(float32x4_t values);
328
329template <>
330inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
331{
332 return vcvtq_u32_f32(values);
333}
334
335template <>
336inline int32x4_t vcvtq_q32_f32(float32x4_t values)
337{
338 return vcvtq_s32_f32(values);
339}
340
341template <typename T>
342inline float32x4_t vcvtq_f32_q32(T values);
343
344template <>
345inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
346{
347 return vcvtq_f32_u32(values);
348}
349
350template <>
351inline float32x4_t vcvtq_f32_q32(int32x4_t values)
352{
353 return vcvtq_f32_s32(values);
354}
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000355
356template <typename Tout>
357inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
358
359template <>
360inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
361{
362 const float new_scale = quant_rescale / scale_pooling;
363 return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
364}
365
366template <>
367inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
368{
369 const float new_scale = quant_rescale / scale_pooling;
370 return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
371}
372
373template <typename Tin, typename Tout>
374inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
375
376template <>
377inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
378{
379 const float32x4x4_t acc =
380 {
381 {
382 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
383 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
384 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
385 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
386 }
387 };
388 return vquantize(acc, requant_qinfo);
389}
390
391template <>
392inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
393{
394 const float32x4x4_t acc =
395 {
396 {
397 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
398 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
399 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
400 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
401 }
402 };
403 return vquantize_signed(acc, requant_qinfo);
404}
405
406template <typename T>
407inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
408
409template <>
410inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
411{
412 const float32x4x2_t acc =
413 {
414 {
415 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
416 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
417 }
418 };
419 return vquantize(acc, requant_qinfo);
420}
421
422template <>
423inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
424{
425 const float32x4x2_t acc =
426 {
427 {
428 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
429 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
430 }
431 };
432 return vquantize_signed(acc, requant_qinfo);
433}
434
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000435} // namespace
436
437NEPoolingLayerKernel::NEPoolingLayerKernel()
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000438 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000439{
440}
441
442BorderSize NEPoolingLayerKernel::border_size() const
443{
444 return _border_size;
445}
446
447void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
448{
449 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
450
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000451 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
452 const bool is_global_pooling = pool_info.is_global_pooling;
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000453 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000454
455 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000456 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
457 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
458 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000459
460 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000461 const Size2D pool_size(
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000462 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
463 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000464
465 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000466 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000467
468 // Check output dimensions
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100469 unsigned int pooled_w;
470 unsigned int pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000471 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
472 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000473 pool_size.x(),
474 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000475 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000476
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000477 // Perform validation step
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100478 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100479
480 // Set instance variables
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000481 _input = input;
482 _output = output;
483 _pool_info = pool_info;
484 _data_layout = input->info()->data_layout();
485 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100486
Georgios Pinitas55186712018-01-08 17:37:12 +0000487 // Get data type
488 const DataType data_type = input->info()->data_type();
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000489 const bool is_nchw = _data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000490
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100491 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000492 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000493 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000494 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000495 if(is_nchw)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100496 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000497 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000498 }
499 else
500 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000501 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000502 }
503 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000504 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000505 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000506 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000507 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000508 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000509 }
510 else
511 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000512 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000513 }
514 }
515 else
516 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000517 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000518 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000519 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000520 }
521 else
522 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000523 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
524 }
525 }
526 }
527 else if(data_type == DataType::QASYMM8_SIGNED)
528 {
529 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
530 {
531 if(is_nchw)
532 {
533 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
534 }
535 else
536 {
537 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
538 }
539 }
540 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
541 {
542 if(is_nchw)
543 {
544 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
545 }
546 else
547 {
548 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
549 }
550 }
551 else
552 {
553 if(is_nchw)
554 {
555 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
556 }
557 else
558 {
559 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
Georgios Pinitas55186712018-01-08 17:37:12 +0000560 }
561 }
562 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000563 else if(data_type == DataType::F16)
564 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000565 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000566 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000567 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000568 {
569 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000570 {
571 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000572 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000573 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000574 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000575 else
576 {
577 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
578 }
579 }
580 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000581 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000582 {
583 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000584 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000585 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000586 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000587 else
588 {
589 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
590 }
591 }
592 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000593 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000594 {
595 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000596 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000597 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
598 }
599 else
600 {
601 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000602 }
603 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000604 }
605 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000606 }
607 }
608 else
609 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000610 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000611 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000612 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
613 }
614 else
615 {
616 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000617 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000618 }
619 }
620 else if(data_type == DataType::F32)
621 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000622 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000623 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000624 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000625 {
626 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000627 {
628 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000629 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000630 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
631 }
632 else
633 {
634 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000635 }
636 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000637 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000638 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000639 {
640 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000641 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000642 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
643 }
644 else
645 {
646 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000647 }
648 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000649 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000650 case 7:
Pablo Tello77e6c552018-12-04 15:33:49 +0000651 {
652 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000653 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000654 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
655 }
656 else
657 {
658 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000659 }
660 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000661 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000662 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000663 {
664 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000665 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000666 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
667 }
668 else
669 {
670 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000671 }
672 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000673 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000674 }
675 }
676 else
677 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000678 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000679 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000680 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
681 }
682 else
683 {
684 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000685 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000686 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100687 }
688
689 // Configure kernel window
Pablo Tello77e6c552018-12-04 15:33:49 +0000690 auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000691 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
692 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100693}
694
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000695template <typename T>
696void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000697{
698 Iterator input(_input, window_input);
699 Iterator output(_output, window);
700
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000701 /** NEON vector types */
702 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
703 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
704 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
705 using q16_t = typename wrapper::traits::promote_t<T>;
706 using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
707 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
708 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
709
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000710 constexpr int pool_size = 2;
711 int pool_stride_x = 0;
712 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000713 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
714 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
715 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
716 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
717 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000718 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
719 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000720
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000721 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
722 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000723
724 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
725
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100726 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
727 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
728 const bool have_different_qinfo = input_qinfo != output_qinfo;
729
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000730 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
731 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
732 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
733
Georgios Pinitas55186712018-01-08 17:37:12 +0000734 execute_window_loop(window, [&](const Coordinates & id)
735 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000736 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
737 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
738 q8x8_t lower_res = {};
739 q8x8_t upper_res = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000740
741 if(pooling_type != PoolingType::MAX)
742 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000743 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
744 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000745
746 // Add rows
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000747 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000748 {
749 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000750 wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
751 wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000752 }
753 };
754
755 // Pair-wise add row data
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000756 const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
757 const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +0000758
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000759 q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
Georgios Pinitas55186712018-01-08 17:37:12 +0000760
761 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000762 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
763 pool_size, upper_bound_w, upper_bound_h,
764 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
765 lower_res = wrapper::vmovn(res_lower);
Georgios Pinitas55186712018-01-08 17:37:12 +0000766
767 // Compute upper result for stride_x == 1
768 if(pool_stride_x == 1)
769 {
770 // Shifted row sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000771 const q16x8x2_t vrsum_shifted =
Georgios Pinitas55186712018-01-08 17:37:12 +0000772 {
773 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000774 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
775 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000776 }
777 };
778
779 // Pair-wise add shifted row
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000780 q16x8_t res_upper = wrapper::vcombine(
781 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
782 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
Georgios Pinitas55186712018-01-08 17:37:12 +0000783
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000784 // Scale upper result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000785 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
786 pool_size, upper_bound_w, upper_bound_h,
787 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
788 upper_res = wrapper::vmovn(res_upper);
Georgios Pinitas55186712018-01-08 17:37:12 +0000789 }
790 }
791 else
792 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000793 const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
794 lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
Georgios Pinitas55186712018-01-08 17:37:12 +0000795 if(pool_stride_x == 1)
796 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000797 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
798 upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
Georgios Pinitas55186712018-01-08 17:37:12 +0000799 }
800 }
801
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100802 if(have_different_qinfo)
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100803 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000804 const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000805 lower_res = wrapper::vgetlow(requantized_output);
806 upper_res = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100807 }
808
Georgios Pinitas55186712018-01-08 17:37:12 +0000809 // Store result
810 if(pool_stride_x == 1)
811 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000812 const q8x8x2_t res = { { lower_res, upper_res } };
813 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000814 }
815 else
816 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000817 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000818 }
819 },
820 input, output);
821}
822
Pablo Tello77e6c552018-12-04 15:33:49 +0000823void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100824{
Pablo Tello77e6c552018-12-04 15:33:49 +0000825 ARM_COMPUTE_UNUSED(pooling_type);
826 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000827#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100828 Iterator input(_input, window_input);
829 Iterator output(_output, window);
830
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000831 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000832 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
833 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
834 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
835 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000836 int pool_stride_x = 0;
837 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000838 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000839 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
840 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100841
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000842 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
843 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
844 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100845
846 execute_window_loop(window, [&](const Coordinates & id)
847 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100848 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
849 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
850 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
851 float16x4_t res = {};
852
853 // Get power of 2 in case of l2 pooling
854 if(pooling_type == PoolingType::L2)
855 {
856 top_data = vmul_f16(top_data, top_data);
857 middle_data = vmul_f16(middle_data, middle_data);
858 bottom_data = vmul_f16(bottom_data, bottom_data);
859 }
860
861 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100862 {
863 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000864 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100865 const float16x4_t scale_v = vdup_n_f16(scale);
866 // Perform pooling
867 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
868 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
869 res = vmul_f16(vpadd_f16(res, res), scale_v);
870 }
871 else
872 {
873 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
874 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
875 res = vpmax_f16(res, res);
876 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100877
878 // Calculate square-root in case of l2 pooling
879 if(pooling_type == PoolingType::L2)
880 {
881 res = vinv_f16(vinvsqrt_f16(res));
882 }
883
Pablo Tello0c34fe22017-06-26 17:17:42 +0100884 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
885 },
886 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000887#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100888 ARM_COMPUTE_UNUSED(window_input);
889 ARM_COMPUTE_UNUSED(window);
890 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000891#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100892}
893
Pablo Tello77e6c552018-12-04 15:33:49 +0000894void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100895{
Pablo Tello77e6c552018-12-04 15:33:49 +0000896 ARM_COMPUTE_UNUSED(pooling_type);
897 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000898#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100899 Iterator input(_input, window_input);
900 Iterator output(_output, window);
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000901 constexpr int pool_size = 2;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000902 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
903 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
904 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
905 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000906 int pool_stride_x, pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000907 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000908 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
909 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100910
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000911 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
912 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100913
914 execute_window_loop(window, [&](const Coordinates & id)
915 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100916 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
917 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
918 float16x4_t res = {};
Pablo Tello0c34fe22017-06-26 17:17:42 +0100919
Georgios Pinitascdf51452017-08-31 14:21:36 +0100920 // Get power of 2 in case of l2 pooling
921 if(pooling_type == PoolingType::L2)
922 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100923 top_data = vmul_f16(top_data, top_data);
924 bottom_data = vmul_f16(bottom_data, bottom_data);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100925 }
926
927 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100928 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000929 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100930 const float16x4_t scale_v = vdup_n_f16(scale);
931
932 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
933 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100934 }
935 else
936 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100937 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
938 res = vpmax_f16(max_data, max_data);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100939 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100940
941 // Calculate square-root in case of l2 pooling
942 if(pooling_type == PoolingType::L2)
943 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100944 res = vinv_f16(vinvsqrt_f16(res));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100945 }
946
947 // Store result
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100948 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100949 },
950 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000951#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100952 ARM_COMPUTE_UNUSED(window_input);
953 ARM_COMPUTE_UNUSED(window);
954 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000955#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100956}
957
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000958template <typename T>
959void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000960{
961 Iterator input(_input, window_input);
962 Iterator output(_output, window);
963
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000964 /** NEON vector types */
965 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
966 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
967 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
968 using q16_t = typename wrapper::traits::promote_t<T>;
969 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
970 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
971
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000972 constexpr int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000973 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
974 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
975 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
976 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000977 int pool_stride_x = 0;
978 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000979 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000980 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
981 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000982
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100983 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
984 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitasd66094e2019-04-15 15:44:17 +0100985
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000986 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
987 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
988 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
989
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000990 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
991 const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
992 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000993
994 execute_window_loop(window, [&](const Coordinates & id)
995 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000996 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
997 const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
998 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
999 q8x8_t fres = {};
1000 q8x16_t fqres = {};
Georgios Pinitas55186712018-01-08 17:37:12 +00001001
1002 if(pooling_type == PoolingType::AVG)
1003 {
1004 // Convert data to u16
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001005 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
1006 const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
1007 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +00001008
1009 // Calculate row sums
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001010 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001011 {
1012 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001013 wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
1014 wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001015 }
1016 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001017 const q16x8x2_t vrsum_shifted_1 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001018 {
1019 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001020 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
1021 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001022 }
1023 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001024 const q16x8x2_t vrsum_shifted_2 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001025 {
1026 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001027 wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
1028 wrapper::vext_2(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001029 }
1030 };
1031 // Calculate final sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001032 q16x8x2_t final_sum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001033 {
1034 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001035 wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1036 wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001037 }
1038 };
1039 if(pool_stride_x == 2)
1040 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001041 q16x8_t res =
Georgios Pinitas55186712018-01-08 17:37:12 +00001042 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001043 wrapper::vgetlane(final_sum.val[0], 0),
1044 wrapper::vgetlane(final_sum.val[0], 2),
1045 wrapper::vgetlane(final_sum.val[0], 4),
1046 wrapper::vgetlane(final_sum.val[0], 6),
1047 wrapper::vgetlane(final_sum.val[1], 0),
1048 wrapper::vgetlane(final_sum.val[1], 2),
1049 wrapper::vgetlane(final_sum.val[1], 4),
1050 wrapper::vgetlane(final_sum.val[1], 6),
Georgios Pinitas55186712018-01-08 17:37:12 +00001051 };
1052
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001053 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
1054 pool_size, upper_bound_w, upper_bound_h,
1055 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1056 fres = wrapper::vmovn(res);
Georgios Pinitas55186712018-01-08 17:37:12 +00001057 }
1058 else
1059 {
1060 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001061 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1062 pool_size, upper_bound_w, upper_bound_h,
1063 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001064 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001065 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1066 pool_size, upper_bound_w, upper_bound_h,
1067 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1068 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +00001069 }
1070 }
1071 else
1072 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001073 const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1074 const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1075 const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1076 const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
Georgios Pinitas55186712018-01-08 17:37:12 +00001077
1078 if(pool_stride_x == 2)
1079 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001080 const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1081 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1082 fres = wrapper::vtbl(table, lookup_val);
Georgios Pinitas55186712018-01-08 17:37:12 +00001083 }
1084 else
1085 {
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001086 fqres = final_max;
Georgios Pinitas55186712018-01-08 17:37:12 +00001087 }
1088 }
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001089
1090 // Store result
1091 if(pool_stride_x == 1)
1092 {
1093 if(input_qinfo != output_qinfo)
1094 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001095 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001096 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001097 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001098 }
1099 else
1100 {
1101 if(input_qinfo != output_qinfo)
1102 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001103 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001104 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001105 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001106 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001107 },
1108 input, output);
1109}
1110
Pablo Tello77e6c552018-12-04 15:33:49 +00001111void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001112{
Pablo Tello77e6c552018-12-04 15:33:49 +00001113 ARM_COMPUTE_UNUSED(pooling_type);
1114 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001115#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1116 Iterator input(_input, window_input);
1117 Iterator output(_output, window);
1118
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001119 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1120 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1121 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1122 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1123 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1124 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001125 int pool_stride_x = 0;
1126 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001127 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001128 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1129 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1130
1131 execute_window_loop(window, [&](const Coordinates & id)
1132 {
1133 float16_t res = 0.0f;
1134 float16x8_t vres = vdupq_n_f16(0.0f);
1135
1136 if(pooling_type != PoolingType::MAX)
1137 {
1138 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001139 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001140
1141 // Perform pooling
1142
1143 for(int y = 0; y < pool_size_y; ++y)
1144 {
1145 int x = 0;
1146 for(; x <= (pool_size_x - 8); x += 8)
1147 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001148 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1149 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001150
1151 // Get power of 2 in case of l2 pooling and accumulate
1152 if(pooling_type == PoolingType::L2)
1153 {
1154 vres = vaddq_f16(vres, vmulq_f16(data, data));
1155 }
1156 else
1157 {
1158 vres = vaddq_f16(vres, data);
1159 }
1160 }
1161
1162 // Leftover for loop
1163 for(; x < pool_size_x; ++x)
1164 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001165 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1166 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001167
1168 // Get power of 2 in case of l2 pooling
1169 if(pooling_type == PoolingType::L2)
1170 {
1171 data *= data;
1172 }
1173
1174 res += data;
1175 }
1176 }
1177
1178 // Reduction
1179 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1180 res += vget_lane_f16(tmp, 0);
1181 res += vget_lane_f16(tmp, 1);
1182 res += vget_lane_f16(tmp, 2);
1183 res += vget_lane_f16(tmp, 3);
1184
1185 // Divide by scale
1186 res *= scale;
1187 }
1188 else
1189 {
1190 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1191 res = std::numeric_limits<float>::lowest();
1192
1193 for(int y = 0; y < pool_size_y; ++y)
1194 {
1195 int x = 0;
1196 for(; x <= (pool_size_x - 8); x += 8)
1197 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001198 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1199 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001200 vres = vmaxq_f16(vres, data);
1201 }
1202
1203 // Leftover for loop
1204 for(; x < pool_size_x; ++x)
1205 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001206 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1207 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1208 res = std::max(res, data);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001209 }
1210 }
1211
1212 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1213 res = std::max(res, vget_lane_f16(tmp, 0));
1214 res = std::max(res, vget_lane_f16(tmp, 1));
1215 res = std::max(res, vget_lane_f16(tmp, 2));
1216 res = std::max(res, vget_lane_f16(tmp, 3));
1217 }
1218
1219 // Calculate square-root in case of l2 pooling
1220 if(pooling_type == PoolingType::L2)
1221 {
1222 res = std::sqrt(res);
1223 }
1224
1225 // Store result
1226 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1227 },
1228 input, output);
1229
1230#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1231 ARM_COMPUTE_UNUSED(window_input);
1232 ARM_COMPUTE_UNUSED(window);
1233 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1234#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1235}
1236
Pablo Tello77e6c552018-12-04 15:33:49 +00001237void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001238{
Pablo Tello77e6c552018-12-04 15:33:49 +00001239 ARM_COMPUTE_UNUSED(pooling_type);
1240 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001241#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1242 Iterator input(_input, window_input);
1243 Iterator output(_output, window);
1244
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001245 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1246 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1247 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1248 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1249 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1250 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001251 int pool_stride_x = 0;
1252 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001253 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001254 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1255 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1256
1257 float16x8_t vres;
1258
1259 execute_window_loop(window, [&](const Coordinates & id)
1260 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001261 const int idx_width = id.y() * pool_stride_x;
1262 const int idx_height = id.z() * pool_stride_y;
1263 const int pool_limit_y = pool_pad_top - idx_height;
1264 const int pool_limit_x = pool_pad_left - idx_width;
1265
1266 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1267 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1268 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1269 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1270
Michalis Spyrou57dac842018-03-01 16:03:50 +00001271 if(pooling_type != PoolingType::MAX)
1272 {
1273 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001274 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1275 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001276 const float16x8_t scale_v = vdupq_n_f16(scale);
1277
1278 // Perform pooling
1279 vres = vdupq_n_f16(0.0f);
Michalis Spyrouced25572018-10-01 16:26:20 +01001280 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001281 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001282 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001283 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001284 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1285 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001286
1287 // Get power of 2 in case of l2 pooling and accumulate
1288 if(pooling_type == PoolingType::L2)
1289 {
1290 vres = vaddq_f16(vres, vmulq_f16(data, data));
1291 }
1292 else
1293 {
1294 vres = vaddq_f16(vres, data);
1295 }
1296 }
1297 }
1298 // Divide by scale
1299 vres = vmulq_f16(vres, scale_v);
1300 }
1301 else
1302 {
1303 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001304
1305 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001306 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001307 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001308 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001309 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1310 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001311 vres = vmaxq_f16(vres, data);
1312 }
1313 }
1314 }
1315
1316 // Calculate square-root in case of l2 pooling
1317 if(pooling_type == PoolingType::L2)
1318 {
1319 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1320 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1321 }
1322
1323 // Store result
1324 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
1325 },
1326 input, output);
1327
1328#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1329 ARM_COMPUTE_UNUSED(window_input);
1330 ARM_COMPUTE_UNUSED(window);
1331 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1332#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1333}
1334
Pablo Tello77e6c552018-12-04 15:33:49 +00001335void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001336{
1337 Iterator input(_input, window_input);
1338 Iterator output(_output, window);
1339
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001340 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1341 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1342 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1343 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1344 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1345 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001346 int pool_stride_x = 0;
1347 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001348 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001349 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1350 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001351
1352 execute_window_loop(window, [&](const Coordinates & id)
1353 {
1354 float res = 0.0f;
1355
1356 if(pooling_type != PoolingType::MAX)
1357 {
1358 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001359 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001360
1361 // Perform pooling
1362 float32x4_t vres = vdupq_n_f32(0.0f);
1363
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001364 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001365 {
1366 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001367 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001368 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001369 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1370 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001371
1372 // Get power of 2 in case of l2 pooling and accumulate
1373 if(pooling_type == PoolingType::L2)
1374 {
1375 vres = vmlaq_f32(vres, data, data);
1376 }
1377 else
1378 {
1379 vres = vaddq_f32(vres, data);
1380 }
1381 }
1382
1383 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001384 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001385 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001386 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1387 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001388
1389 // Get power of 2 in case of l2 pooling
1390 if(pooling_type == PoolingType::L2)
1391 {
1392 data *= data;
1393 }
1394
1395 res += data;
1396 }
1397 }
1398
1399#if defined(__aarch64__)
1400 // Reduction operation available on 64 bit architectures only
1401 res += vaddvq_f32(vres);
1402#else // __aarch64__
1403 // Reduction
1404 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1405 tmp = vpadd_f32(tmp, tmp);
1406
1407 res += vget_lane_f32(tmp, 0);
1408#endif // __aarch64__
1409 // Divide by scale
1410 res *= scale;
1411 }
1412 else
1413 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001414 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1415 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001416
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001417 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001418 {
1419 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001420 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001421 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001422 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1423 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001424 vres = vmaxq_f32(vres, data);
1425 }
1426
1427 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001428 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001429 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001430 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1431 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001432 res = std::max(res, data);
1433 }
1434 }
1435
1436#if defined(__aarch64__)
1437 // Reduction operation available on 64 bit architectures only
1438 res = std::max(vmaxvq_f32(vres), res);
1439#else // __aarch64__
1440 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1441 tmp = vpmax_f32(tmp, tmp);
1442
1443 res = std::max(res, vget_lane_f32(tmp, 0));
1444#endif // __aarch64__
1445 }
1446
1447 // Calculate square-root in case of l2 pooling
1448 if(pooling_type == PoolingType::L2)
1449 {
1450 res = std::sqrt(res);
1451 }
1452
1453 // Store result
1454 *(reinterpret_cast<float *>(output.ptr())) = res;
1455 },
1456 input, output);
1457}
1458
Pablo Tello77e6c552018-12-04 15:33:49 +00001459void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1460{
1461 Iterator input(_input, window_input);
1462 Iterator output(_output, window);
1463
1464 constexpr int pool_size = 2;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001465 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1466 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1467 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1468 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001469 int pool_stride_x = 0;
1470 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001471 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001472 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1473 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1474
1475 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1476 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1477
1478 execute_window_loop(window, [&](const Coordinates & id)
1479 {
1480 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1481 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1482 float32x2_t res = {};
1483 float final_res = 0;
1484
1485 // Get power of 2 in case of l2 pooling
1486 if(pooling_type == PoolingType::L2)
1487 {
1488 top_data = vmul_f32(top_data, top_data);
1489 bottom_data = vmul_f32(bottom_data, bottom_data);
1490 }
1491
1492 if(pooling_type != PoolingType::MAX)
1493 {
1494 // Calculate scale
1495 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1496 const float32x2_t scale_v = vdup_n_f32(scale);
1497
1498 // Perform pooling
1499 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1500 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1501 }
1502 else
1503 {
1504 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1505 res = vpmax_f32(max_data, max_data);
1506 }
1507 final_res = vget_lane_f32(res, 0);
1508
1509 // Calculate square-root in case of l2 pooling
1510 if(pooling_type == PoolingType::L2)
1511 {
1512 final_res = sqrt(final_res);
1513 }
1514
1515 // Store result
1516 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1517 },
1518 input, output);
1519}
1520
1521void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1522{
1523 Iterator input(_input, window_input);
1524 Iterator output(_output, window);
1525
1526 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001527 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1528 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1529 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1530 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001531 int pool_stride_x = 0;
1532 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001533 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001534 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1535 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1536
1537 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1538 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1539 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1540
1541 execute_window_loop(window, [&](const Coordinates & id)
1542 {
1543 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1544 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1545 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1546 float32x2_t res = {};
1547 float final_res = 0;
1548
1549 // Get power of 2 in case of l2 pooling
1550 if(pooling_type == PoolingType::L2)
1551 {
1552 top_data = vmulq_f32(top_data, top_data);
1553 middle_data = vmulq_f32(middle_data, middle_data);
1554 bottom_data = vmulq_f32(bottom_data, bottom_data);
1555 }
1556
1557 if(pooling_type != PoolingType::MAX)
1558 {
1559 // Calculate scale
1560 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1561 const float32x2_t scale_v = vdup_n_f32(scale);
1562
1563 // Perform pooling
1564 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1565 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1566 res = vmul_f32(vpadd_f32(res, res), scale_v);
1567 }
1568 else
1569 {
1570 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1571 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1572 res = vpmax_f32(res, res);
1573 }
1574 final_res = vget_lane_f32(res, 0);
1575
1576 // Calculate square-root in case of l2 pooling
1577 if(pooling_type == PoolingType::L2)
1578 {
1579 final_res = sqrt(final_res);
1580 }
1581
1582 // Store result
1583 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1584 },
1585 input, output);
1586}
1587
1588void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1589{
1590 Iterator input(_input, window_input);
1591 Iterator output(_output, window);
1592
1593 constexpr const int pool_size = 7;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001594 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1595 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1596 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1597 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001598 int pool_stride_x = 0;
1599 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001600 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001601 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1602 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1603
1604 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1605 for(int i = 0; i < pool_size; ++i)
1606 {
1607 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1608 }
1609
1610 execute_window_loop(window, [&](const Coordinates & id)
1611 {
1612 float32x2_t res = {};
1613 float final_res = 0.f;
1614 if(pooling_type != PoolingType::MAX)
1615 {
1616 // Calculate scale
1617 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1618 const float32x2_t scale_v = vdup_n_f32(scale);
1619
1620 // Perform pooling
1621 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1622 // Get power of 2 in case of l2 pooling
1623 if(pooling_type == PoolingType::L2)
1624 {
1625 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1626 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1627 }
1628 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1629 for(int i = 1; i < pool_size; ++i)
1630 {
1631 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1632 // Get power of 2 in case of l2 pooling
1633 if(pooling_type == PoolingType::L2)
1634 {
1635 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1636 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1637 }
1638 sum_data = vaddq_f32(sum_data, data.val[0]);
1639 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1640 }
1641 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1642 res = vmul_f32(vpadd_f32(res, res), scale_v);
1643 }
1644 else
1645 {
1646 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1647 for(int i = 1; i < pool_size; ++i)
1648 {
1649 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1650 max_data = vmax2q_f32(max_data, data);
1651 }
1652 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1653 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1654 res = vpmax_f32(res, res);
1655 }
1656 final_res = vget_lane_f32(res, 0);
1657
1658 // Calculate square-root in case of l2 pooling
1659 if(pooling_type == PoolingType::L2)
1660 {
1661 final_res = sqrt(final_res);
1662 }
1663
1664 // Store result
1665 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1666 },
1667 input, output);
1668}
1669
1670void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001671{
1672 Iterator input(_input, window_input);
1673 Iterator output(_output, window);
1674
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001675 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1676 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1677 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1678 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1679 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1680 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001681 int pool_stride_x = 0;
1682 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001683 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001684 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1685 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1686
1687 float32x4_t vres;
1688
1689 execute_window_loop(window, [&](const Coordinates & id)
1690 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001691 const int idx_width = id.y() * pool_stride_x;
1692 const int idx_height = id.z() * pool_stride_y;
1693 const int pool_limit_y = pool_pad_top - idx_height;
1694 const int pool_limit_x = pool_pad_left - idx_width;
1695
1696 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1697 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1698 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1699 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1700
Michalis Spyrou57dac842018-03-01 16:03:50 +00001701 if(pooling_type != PoolingType::MAX)
1702 {
1703 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001704 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1705 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001706 const float32x4_t scale_v = vdupq_n_f32(scale);
1707
1708 // Perform pooling
1709 vres = vdupq_n_f32(0.0f);
1710
Michalis Spyrouced25572018-10-01 16:26:20 +01001711 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001712 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001713 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001714 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001715 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1716 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001717
1718 // Get power of 2 in case of l2 pooling and accumulate
1719 if(pooling_type == PoolingType::L2)
1720 {
1721 vres = vmlaq_f32(vres, data, data);
1722 }
1723 else
1724 {
1725 vres = vaddq_f32(vres, data);
1726 }
1727 }
1728 }
1729 // Divide by scale
1730 vres = vmulq_f32(vres, scale_v);
1731 }
1732 else
1733 {
1734 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001735 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001736 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001737 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001738 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001739 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1740 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001741 vres = vmaxq_f32(vres, data);
1742 }
1743 }
1744 }
1745
1746 // Calculate square-root in case of l2 pooling
1747 if(pooling_type == PoolingType::L2)
1748 {
Georgios Pinitas27f223d2019-12-16 19:23:02 +00001749 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1750 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
1751 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
1752 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
1753 };
1754 vres = l2_res;
Michalis Spyrou57dac842018-03-01 16:03:50 +00001755 }
1756
1757 // Store result
1758 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
1759 },
1760 input, output);
1761}
1762
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001763template <typename T>
1764void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00001765{
1766 Iterator input(_input, window_input);
1767 Iterator output(_output, window);
1768
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001769 /** NEON vector types */
1770 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1771 using q16_t = typename wrapper::traits::promote_t<T>;
1772 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1773 using q32_t = typename wrapper::traits::promote_t<q16_t>;
1774 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
1775
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001776 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1777 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1778 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1779 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1780 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1781 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001782 int pool_stride_x = 0;
1783 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001784 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001785 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1786 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001787
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001788 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1789 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
1790
Georgios Pinitas55186712018-01-08 17:37:12 +00001791 execute_window_loop(window, [&](const Coordinates & id)
1792 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001793 T res = std::numeric_limits<T>::min();
Georgios Pinitas55186712018-01-08 17:37:12 +00001794
1795 if(pooling_type != PoolingType::MAX)
1796 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001797 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1798 q32_t sres = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001799
1800 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001801 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001802
1803 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001804 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001805 {
1806 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001807 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001808 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001809 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1810 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00001811
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001812 const q16x8_t data_q16 = wrapper::vmovl(data);
1813 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001814 }
1815
1816 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001817 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001818 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001819 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1820 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00001821 sres += data;
1822 }
1823 }
1824
1825 // Reduction
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001826 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
1827 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
Georgios Pinitas55186712018-01-08 17:37:12 +00001828
1829 // Divide by scale
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001830 res = static_cast<T>(support::cpp11::round(sres * scale));
Georgios Pinitas55186712018-01-08 17:37:12 +00001831 }
1832 else
1833 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001834 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
Georgios Pinitas55186712018-01-08 17:37:12 +00001835
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001836 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001837 {
1838 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001839 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001840 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001841 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1842 (_input->info()->strides_in_bytes().y())));
1843 vres = wrapper::vmax(vres, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00001844 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001845 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001846 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001847 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001848 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1849 (_input->info()->strides_in_bytes().y())));
1850 res = std::max(res, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00001851 }
1852 }
1853
1854 // Reduce max
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001855 vres = wrapper::vpmax(vres, vres);
1856 vres = wrapper::vpmax(vres, vres);
1857 vres = wrapper::vpmax(vres, vres);
Georgios Pinitas55186712018-01-08 17:37:12 +00001858
1859 // Get max value
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001860 res = std::max(res, wrapper::vgetlane(vres, 0));
Georgios Pinitas55186712018-01-08 17:37:12 +00001861 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001862 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001863 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
1864 *(reinterpret_cast<T *>(output.ptr())) = res;
Georgios Pinitas55186712018-01-08 17:37:12 +00001865 },
1866 input, output);
1867}
1868
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001869template <typename T>
1870void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001871{
1872 Iterator input(_input, window_input);
1873 Iterator output(_output, window);
1874
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001875 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1876 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
1877 using q16_t = typename wrapper::traits::promote_t<T>;
1878 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1879 using q32_t = typename wrapper::traits::promote_t<q16_t>;
1880 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
1881
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001882 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1883 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1884 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1885 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1886 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1887 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001888
1889 int pool_stride_x = 0;
1890 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001891 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001892 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1893 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1894
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001895 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
1896 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
1897 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitas283fc602018-11-09 10:46:43 +00001898
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00001899 const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001900 // "new_offset" doesn't have to consider the "half_scale_v" in its computation
1901 // With a requantization performed in a single step there won't be uncertainties introduced
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00001902 const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001903
1904 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
1905 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1906 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
1907
Michalis Spyrou57dac842018-03-01 16:03:50 +00001908 execute_window_loop(window, [&](const Coordinates & id)
1909 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001910 const int idx_width = id.y() * pool_stride_x;
1911 const int idx_height = id.z() * pool_stride_y;
1912 const int pool_limit_y = pool_pad_top - idx_height;
1913 const int pool_limit_x = pool_pad_left - idx_width;
1914
1915 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1916 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1917 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1918 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1919
Michalis Spyrou57dac842018-03-01 16:03:50 +00001920 if(pooling_type != PoolingType::MAX)
1921 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001922 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1923 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1924 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
1925 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00001926
1927 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001928 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1929 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001930
1931 // Perform pooling
Michalis Spyrouced25572018-10-01 16:26:20 +01001932 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001933 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001934 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001935 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001936 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1937 (_input->info()->strides_in_bytes().z())));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001938
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001939 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
1940 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
1941 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
1942 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
1943 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
1944 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001945 }
1946 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001947
Pablo Telloa52e4cf2019-04-01 14:55:18 +01001948 if(input_qinfo != output_qinfo)
1949 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001950 const float32x4x4_t vres =
1951 {
1952 {
1953 vcvtq_f32_q32(vres1),
1954 vcvtq_f32_q32(vres2),
1955 vcvtq_f32_q32(vres3),
1956 vcvtq_f32_q32(vres4),
1957 }
1958 };
1959 const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
1960 // Store result
1961 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), wrapper::vgetlow(requantized_output));
1962 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, wrapper::vgethigh(requantized_output));
Pablo Telloa52e4cf2019-04-01 14:55:18 +01001963 }
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001964 else
1965 {
1966 const float32x4_t scale_v = vdupq_n_f32(scale);
1967 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
1968 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
1969 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
1970 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
1971 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001972
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001973 const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
1974 const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
1975 // Store result
1976 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res1);
1977 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, res2);
1978 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001979 }
1980 else
1981 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001982 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00001983
Michalis Spyrouced25572018-10-01 16:26:20 +01001984 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001985 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001986 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001987 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001988 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1989 (_input->info()->strides_in_bytes().z())));
1990 vres = wrapper::vmax(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001991 }
1992 }
1993
1994 // Store result
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001995 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001996 }
1997 },
1998 input, output);
1999}
2000
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002001Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
2002{
2003 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2004
2005 unsigned int pooled_w = 0;
2006 unsigned int pooled_h = 0;
2007 unsigned int num_elems_processed_per_iteration = 0;
2008 BorderSize border_size(0);
2009
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002010 const bool is_global_pooling = pool_info.is_global_pooling;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002011 unsigned int pool_size_x = 0;
2012 unsigned int pool_size_y = 0;
2013
2014 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +00002015 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2016 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2017 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002018
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002019 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2020 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002021
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002022 // Validate pool info before calling scaled_dimensions
2023 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002024
2025 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00002026 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2027 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002028 pool_size_x,
2029 pool_size_y,
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002030 pool_info.pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002031
Georgios Pinitas13d96e02018-08-23 11:20:23 +01002032 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002033 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
2034 pool_size_x, pool_size_y)
2035 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002036
2037 return Status{};
2038}
2039
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002040void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002041{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002042 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002043 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2044 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2045 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2046
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002047 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
2048 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
2049 const unsigned int pool_size = _pool_info.pool_size.width;
2050 const bool exclude_padding = _pool_info.exclude_padding;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002051
Michalis Spyrou57dac842018-03-01 16:03:50 +00002052 Window window_input(window);
Georgios Pinitas14d9d982019-12-13 12:33:09 +00002053 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002054 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002055 // Set step for input in x and y direction for the input
2056 unsigned int window_x_inc = 0;
2057 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01002058 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002059 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002060 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002061 {
2062 window_x_inc = pool_stride_x;
2063 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2064 {
2065 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2066 }
2067 break;
2068 }
Pablo Tello77e6c552018-12-04 15:33:49 +00002069
Georgios Pinitas13d96e02018-08-23 11:20:23 +01002070 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002071 case DataType::F32:
2072 {
2073 window_x_inc = pool_stride_x;
2074 break;
2075 }
2076 default:
2077 {
2078 ARM_COMPUTE_ERROR("Not supported");
2079 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002080 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002081 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2082 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002083 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002084 else
2085 {
Georgios Pinitascac13b12018-04-27 19:07:19 +01002086 window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002087 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2088 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2089 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002090
2091 // Run function
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002092 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002093}
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002094} // namespace arm_compute