blob: 1310ef3521deda770be72abf43c92831d3e3226f [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010031#include "arm_compute/core/TensorInfo.h"
32#include "arm_compute/core/Utils.h"
33#include "arm_compute/core/Validate.h"
34#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010035#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010036#include "src/core/NEON/NEAsymm.h"
37#include "src/core/NEON/NEFixedPoint.h"
38#include "src/core/NEON/NEMath.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000039#include "support/ToolchainSupport.h"
40
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010041#include "src/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042#include <algorithm>
43#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010044#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010045#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010046#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010047#include <string>
48#include <tuple>
49
Manuel Bottinib4bb8272019-12-18 18:01:27 +000050namespace arm_compute
51{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010052using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010053
54namespace
55{
Michalis Spyroucffb2a32020-09-08 16:26:38 +010056template <typename T>
57inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
58quantize(float val, const UniformQuantizationInfo &info)
59{
60 return quantize_qasymm8_signed(val, info);
61}
62
63template <typename T>
64inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
65quantize(float val, const UniformQuantizationInfo &info)
66{
67 return quantize_qasymm8(val, info);
68}
69
Pablo Tello77e6c552018-12-04 15:33:49 +000070inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010071 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
72{
Michalis Spyrou57dac842018-03-01 16:03:50 +000073 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
74 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
75
76 int start_x = id[idx_width] * stride_x - pad_x;
77 int start_y = id[idx_height] * stride_y - pad_y;
78
79 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
80 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000081 if(exclude_padding)
82 {
83 start_x = std::max(0, start_x);
84 start_y = std::max(0, start_y);
85 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010086 return 1.f / ((end_y - start_y) * (end_x - start_x));
87}
88
Manuel Bottinib4bb8272019-12-18 18:01:27 +000089template <typename T, typename TVec>
90inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
Georgios Pinitas55186712018-01-08 17:37:12 +000091 const int pool_size, const int upper_bound_w, const int upper_bound_h,
92 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
93{
94 int start_x = (id.x() + id_offset) * stride_x - pad_x;
95 int start_y = id.y() * stride_y - pad_y;
96 const int end_y = std::min(start_y + pool_size, upper_bound_h);
97 if(exclude_padding)
98 {
99 start_y = std::max(0, start_y);
100 }
101
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000102 std::array<T, 8> elems =
Georgios Pinitas55186712018-01-08 17:37:12 +0000103 {
104 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000105 wrapper::vgetlane(v, 0),
106 wrapper::vgetlane(v, 1),
107 wrapper::vgetlane(v, 2),
108 wrapper::vgetlane(v, 3),
109 wrapper::vgetlane(v, 4),
110 wrapper::vgetlane(v, 5),
111 wrapper::vgetlane(v, 6),
112 wrapper::vgetlane(v, 7),
Georgios Pinitas55186712018-01-08 17:37:12 +0000113 }
114 };
115
116 for(auto &el : elems)
117 {
118 int c_start_x = start_x;
119 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
120 if(exclude_padding)
121 {
122 c_start_x = std::max(0, c_start_x);
123 }
124 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
125 el *= scale;
126 start_x += step * stride_x;
127 }
128
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000129 v = wrapper::vsetlane(elems[0], v, 0);
130 v = wrapper::vsetlane(elems[1], v, 1);
131 v = wrapper::vsetlane(elems[2], v, 2);
132 v = wrapper::vsetlane(elems[3], v, 3);
133 v = wrapper::vsetlane(elems[4], v, 4);
134 v = wrapper::vsetlane(elems[5], v, 5);
135 v = wrapper::vsetlane(elems[6], v, 6);
136 v = wrapper::vsetlane(elems[7], v, 7);
Georgios Pinitas55186712018-01-08 17:37:12 +0000137}
138
morgolockcc1f6c92020-03-24 09:26:48 +0000139Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
140 unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100141{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000142 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100143
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000144 int pool_stride_x = 0;
145 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000146 PoolingType pool_type = pool_info.pool_type;
147 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100148 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100149
Anthony Barbiereaefd002018-07-20 17:49:35 +0100150 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
morgolockcc1f6c92020-03-24 09:26:48 +0000151 if(indices)
152 {
morgolock37722d92020-04-09 14:17:48 +0100153 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
morgolockcc1f6c92020-03-24 09:26:48 +0000154 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
155 ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
156 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000157 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000158 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Michele Di Giorgio2c877192020-02-18 19:06:27 +0000159 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
160 && (input->data_layout() == DataLayout::NHWC),
161 "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000162
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000163 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100164 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000165 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000166 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
167 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
168 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
morgolockcc1f6c92020-03-24 09:26:48 +0000169
170 if(indices)
171 {
172 ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
morgolockcc1f6c92020-03-24 09:26:48 +0000173 ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
174 || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
175 }
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100176 }
177
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000178 return Status{};
179}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100180
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000181Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000182{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000183 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
184 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000185
186 return Status{};
187}
188
morgolockcc1f6c92020-03-24 09:26:48 +0000189std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
190 unsigned int &num_elems_processed_per_iteration,
191 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000192 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000193{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100194 // Output auto inizialitation if not yet initialized
195 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
morgolockcc1f6c92020-03-24 09:26:48 +0000196 if(indices)
197 {
198 // Indices auto inizialitation if not yet initialized
morgolocke383c352020-04-03 16:57:46 +0100199 auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
200 pool_info)))
201 .set_data_type(DataType::U32) /* we store the offset to the element */);
morgolockcc1f6c92020-03-24 09:26:48 +0000202 }
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000203 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000204 unsigned int num_elems_read_per_iteration = 0;
205 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000206 int pool_stride_x = 0;
207 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000208 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
209 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
210 const int input_width = input->dimension(idx_width);
211 const int input_height = input->dimension(idx_height);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000212 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000213 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000214 const int pool_pad_right = pad_stride_info.pad_right();
215 const int pool_pad_top = pad_stride_info.pad_top();
216 const int pool_pad_left = pad_stride_info.pad_left();
217 const int pool_pad_bottom = pad_stride_info.pad_bottom();
218 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000219
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000220 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +0000221 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
222 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000223 pool_size_x,
224 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000225 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100226
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000227 //If it's not squared and optimized will be executed the MxN
228 num_elems_read_per_iteration = 1;
229 num_elems_processed_per_iteration = 1;
230 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100231
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000232 if(is_square)
233 {
234 switch(input->data_type())
235 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000236 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000237 case DataType::QASYMM8_SIGNED:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000238 switch(pool_size_x)
239 {
240 case 2:
241 num_elems_read_per_iteration = 16;
242 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
243 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
244 break;
245 case 3:
246 num_elems_read_per_iteration = 16;
247 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
248 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
249 break;
250 default:
251 break;
252 }
253 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000254#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
255 case DataType::F16:
256 switch(pool_size_x)
257 {
258 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000259 case 3:
260 num_elems_read_per_iteration = 4;
261 num_elems_processed_per_iteration = 1;
262 num_elems_horizontal_window = 1;
263 break;
264 default:
265 break;
266 }
267 break;
268#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
269 case DataType::F32:
270 switch(pool_size_x)
271 {
272 case 2:
273 num_elems_read_per_iteration = 2;
274 break;
275 case 3:
276 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
277 break;
278 case 7:
279 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
280 break;
281 default:
282 break;
283 }
284 num_elems_processed_per_iteration = 1;
285 num_elems_horizontal_window = 1;
286 break;
287 default:
288 ARM_COMPUTE_ERROR("Element size not supported");
289 break;
290 }
291 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000292
293 bool window_changed = false;
294 Window win{};
295 if(data_layout == DataLayout::NCHW)
296 {
297 // Number of iterations in X dimension
298 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000299 // Upper limit for the number of right/bottom border elements that are accessed
300 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
301 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
morgolockcc1f6c92020-03-24 09:26:48 +0000302 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
303 border_size.right = std::max(upper_bound_w, pool_pad_right);
304 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000305 TensorShape output_shape{ input->tensor_shape() };
306 output_shape.set(0, pooled_w);
307 output_shape.set(1, pooled_h);
308 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000309 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
morgolockcc1f6c92020-03-24 09:26:48 +0000310 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000311 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
morgolockcc1f6c92020-03-24 09:26:48 +0000312 if(indices)
313 {
314 AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
315 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
316 }
317 else
318 {
319 window_changed = update_window_and_padding(win, input_access, output_access);
320 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000321 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
322 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000323
324 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
325 return std::make_pair(err, win);
326}
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000327
328template <typename T>
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000329inline T vcvtq_q32_f32(float32x4_t values);
330
331template <>
332inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
333{
334 return vcvtq_u32_f32(values);
335}
336
337template <>
338inline int32x4_t vcvtq_q32_f32(float32x4_t values)
339{
340 return vcvtq_s32_f32(values);
341}
342
343template <typename T>
344inline float32x4_t vcvtq_f32_q32(T values);
345
346template <>
347inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
348{
349 return vcvtq_f32_u32(values);
350}
351
352template <>
353inline float32x4_t vcvtq_f32_q32(int32x4_t values)
354{
355 return vcvtq_f32_s32(values);
356}
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000357
358template <typename Tout>
359inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
360
361template <>
362inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
363{
364 const float new_scale = quant_rescale / scale_pooling;
365 return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
366}
367
368template <>
369inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
370{
371 const float new_scale = quant_rescale / scale_pooling;
372 return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
373}
374
375template <typename Tin, typename Tout>
376inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
377
378template <>
379inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
380{
381 const float32x4x4_t acc =
382 {
383 {
384 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
385 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
386 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
387 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
388 }
389 };
390 return vquantize(acc, requant_qinfo);
391}
392
393template <>
394inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
395{
396 const float32x4x4_t acc =
397 {
398 {
399 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
400 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
401 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
402 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
403 }
404 };
405 return vquantize_signed(acc, requant_qinfo);
406}
407
408template <typename T>
409inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
410
411template <>
412inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
413{
414 const float32x4x2_t acc =
415 {
416 {
417 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
418 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
419 }
420 };
421 return vquantize(acc, requant_qinfo);
422}
423
424template <>
425inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
426{
427 const float32x4x2_t acc =
428 {
429 {
430 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
431 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
432 }
433 };
434 return vquantize_signed(acc, requant_qinfo);
435}
436
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000437} // namespace
438
439NEPoolingLayerKernel::NEPoolingLayerKernel()
morgolockcc1f6c92020-03-24 09:26:48 +0000440 : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000441{
442}
443
444BorderSize NEPoolingLayerKernel::border_size() const
445{
446 return _border_size;
447}
448
morgolockcc1f6c92020-03-24 09:26:48 +0000449void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000450{
451 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000452 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
453 const bool is_global_pooling = pool_info.is_global_pooling;
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000454 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000455
456 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000457 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
458 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
459 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000460
461 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000462 const Size2D pool_size(
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000463 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
464 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000465
466 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000467 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000468
469 // Check output dimensions
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100470 unsigned int pooled_w;
471 unsigned int pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000472 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
473 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000474 pool_size.x(),
475 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000476 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000477
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000478 // Perform validation step
morgolockcc1f6c92020-03-24 09:26:48 +0000479 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100480
481 // Set instance variables
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000482 _input = input;
483 _output = output;
morgolockcc1f6c92020-03-24 09:26:48 +0000484 _indices = indices;
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000485 _pool_info = pool_info;
486 _data_layout = input->info()->data_layout();
487 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100488
Georgios Pinitas55186712018-01-08 17:37:12 +0000489 // Get data type
490 const DataType data_type = input->info()->data_type();
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000491 const bool is_nchw = _data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000492
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100493 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000494 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100495 if(!is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000496 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100497 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
498 }
499 else
500 {
501 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100502 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000503 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000504 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100505 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000506 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000507 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000508 }
509 else
510 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000511 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000512 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000513 }
514 }
515 else if(data_type == DataType::QASYMM8_SIGNED)
516 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100517 if(!is_nchw)
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000518 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100519 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
520 }
521 else
522 {
523 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000524 {
525 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
526 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100527 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000528 {
529 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
530 }
531 else
532 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000533 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
534 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000535 }
536 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000537 else if(data_type == DataType::F16)
538 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100539 if(!is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000540 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100541 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000542 }
543 else
544 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100545 if(_is_square)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000546 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100547 switch(pool_size.x())
548 {
549 case 2:
550 {
551 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
552 }
553 break;
554 case 3:
555 {
556 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
557 }
558 break;
559 default:
560 {
561 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
562 break;
563 }
564 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000565 }
566 else
567 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100568 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000569 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000570 }
571 }
572 else if(data_type == DataType::F32)
573 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100574 if(!is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000575 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100576 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000577 }
578 else
579 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100580 if(_is_square)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000581 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100582 switch(pool_size.x())
583 {
584 case 2:
585 {
586 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
587 break;
588 }
589 case 3:
590 {
591 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
592 break;
593 }
594 case 7:
595 {
596 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
597 break;
598 }
599 default:
600 {
601 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
602 break;
603 }
604 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000605 }
606 else
607 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100608 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000609 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000610 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100611 }
612
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100613 if(!is_nchw)
614 {
615 // Configure kernel window
616 Window win = calculate_max_window(*output->info(), Steps());
617 Coordinates coord;
618 coord.set_num_dimensions(output->info()->num_dimensions());
619 output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
620 INEKernel::configure(win);
621 }
622 else
623 {
624 // Configure kernel window
625 auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
626 pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
627 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
628 INEKernel::configure(win_config.second);
629 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100630}
631
Sheri Zhang996c7772020-08-10 12:02:59 +0100632template <typename T>
Sheri Zhange0681992020-07-14 15:29:28 +0100633inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y)
634{
635 const int pad_left = info.padding().left;
636 const int pad_right = info.padding().right;
637 const int pad_top = info.padding().top;
638 const int pad_bottom = info.padding().bottom;
639 const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
640 const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
641 const int pad_horiz = pad_left + pad_right;
642 const int pad_vert = pad_top + pad_bottom;
643
644 if(info.data_layout() == DataLayout::NCHW)
645 {
646 const uint32_t offset_base = padded_offset
647 - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
648 - pad_top * sizeof(T) /* top padding */
649 - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
650 - in_stride_w * id[3];
651
652 return offset_base;
653 }
654 else
655 {
656 const uint32_t offset_base = padded_offset
657 - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
658 - pad_top * sizeof(T) // top padding
659 - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
660 - in_stride_w * id[3];
661
662 return offset_base;
663 }
664}
665
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000666template <typename T>
667void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000668{
669 Iterator input(_input, window_input);
670 Iterator output(_output, window);
671
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000672 /** NEON vector types */
673 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
674 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
675 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
676 using q16_t = typename wrapper::traits::promote_t<T>;
677 using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
678 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
679 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
680
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000681 constexpr int pool_size = 2;
682 int pool_stride_x = 0;
683 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000684 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
685 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
686 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
687 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
688 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000689 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
690 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000691
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000692 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
693 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000694
695 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
696
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100697 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
698 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
699 const bool have_different_qinfo = input_qinfo != output_qinfo;
700
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000701 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
702 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
703 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
704
Georgios Pinitas55186712018-01-08 17:37:12 +0000705 execute_window_loop(window, [&](const Coordinates & id)
706 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000707 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
708 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
709 q8x8_t lower_res = {};
710 q8x8_t upper_res = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000711
712 if(pooling_type != PoolingType::MAX)
713 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000714 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
715 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000716
717 // Add rows
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000718 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000719 {
720 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000721 wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
722 wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000723 }
724 };
725
726 // Pair-wise add row data
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000727 const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
728 const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +0000729
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000730 q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
Georgios Pinitas55186712018-01-08 17:37:12 +0000731
732 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000733 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
734 pool_size, upper_bound_w, upper_bound_h,
735 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
736 lower_res = wrapper::vmovn(res_lower);
Georgios Pinitas55186712018-01-08 17:37:12 +0000737
738 // Compute upper result for stride_x == 1
739 if(pool_stride_x == 1)
740 {
741 // Shifted row sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000742 const q16x8x2_t vrsum_shifted =
Georgios Pinitas55186712018-01-08 17:37:12 +0000743 {
744 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000745 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
746 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000747 }
748 };
749
750 // Pair-wise add shifted row
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000751 q16x8_t res_upper = wrapper::vcombine(
752 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
753 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
Georgios Pinitas55186712018-01-08 17:37:12 +0000754
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000755 // Scale upper result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000756 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
757 pool_size, upper_bound_w, upper_bound_h,
758 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
759 upper_res = wrapper::vmovn(res_upper);
Georgios Pinitas55186712018-01-08 17:37:12 +0000760 }
761 }
762 else
763 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000764 const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
765 lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
Georgios Pinitas55186712018-01-08 17:37:12 +0000766 if(pool_stride_x == 1)
767 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000768 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
769 upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
Georgios Pinitas55186712018-01-08 17:37:12 +0000770 }
771 }
772
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100773 if(have_different_qinfo)
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100774 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000775 const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000776 lower_res = wrapper::vgetlow(requantized_output);
777 upper_res = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100778 }
779
Georgios Pinitas55186712018-01-08 17:37:12 +0000780 // Store result
781 if(pool_stride_x == 1)
782 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000783 const q8x8x2_t res = { { lower_res, upper_res } };
784 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000785 }
786 else
787 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000788 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000789 }
790 },
791 input, output);
792}
793
Pablo Tello77e6c552018-12-04 15:33:49 +0000794void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100795{
Pablo Tello77e6c552018-12-04 15:33:49 +0000796 ARM_COMPUTE_UNUSED(pooling_type);
797 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000798#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100799 Iterator input(_input, window_input);
800 Iterator output(_output, window);
801
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000802 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000803 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
804 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
805 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
806 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000807 int pool_stride_x = 0;
808 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000809 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000810 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
811 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100812
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000813 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
814 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
815 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100816
817 execute_window_loop(window, [&](const Coordinates & id)
818 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100819 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
820 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
821 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
822 float16x4_t res = {};
823
824 // Get power of 2 in case of l2 pooling
825 if(pooling_type == PoolingType::L2)
826 {
827 top_data = vmul_f16(top_data, top_data);
828 middle_data = vmul_f16(middle_data, middle_data);
829 bottom_data = vmul_f16(bottom_data, bottom_data);
830 }
831
832 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100833 {
834 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000835 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100836 const float16x4_t scale_v = vdup_n_f16(scale);
837 // Perform pooling
838 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
839 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
840 res = vmul_f16(vpadd_f16(res, res), scale_v);
841 }
842 else
843 {
844 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
845 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
846 res = vpmax_f16(res, res);
847 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100848
849 // Calculate square-root in case of l2 pooling
850 if(pooling_type == PoolingType::L2)
851 {
852 res = vinv_f16(vinvsqrt_f16(res));
853 }
854
Pablo Tello0c34fe22017-06-26 17:17:42 +0100855 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
856 },
857 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000858#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100859 ARM_COMPUTE_UNUSED(window_input);
860 ARM_COMPUTE_UNUSED(window);
861 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000862#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100863}
864
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Widen the first two lanes of an FP16 vector to a two-lane FP32 vector.
 *
 * Overload selected when @p T is float16_t. Lanes 2 and 3 of @p input are ignored.
 *
 * @param[in] input Vector of four FP16 values.
 *
 * @return Vector holding lanes 0 and 1 of @p input converted to float.
 */
template <typename T>
inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
f16_to_f32(float16x4_t input)
{
    const float       lane0   = static_cast<float>(vget_lane_f16(input, 0));
    const float       lane1   = static_cast<float>(vget_lane_f16(input, 1));
    const float32x2_t widened = { lane0, lane1 };
    return widened;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
874
875template <typename T>
876inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
877f16_to_f32(float32x2_t input)
878{
879 return input;
880}
881
Sheri Zhang996c7772020-08-10 12:02:59 +0100882template <typename T>
Sheri Zhange0681992020-07-14 15:29:28 +0100883void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window)
884{
885 Iterator input(_input, window_input);
886 Iterator output(_output, window);
887 Iterator indices(_indices, window);
888 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
889 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
890 int pool_stride_x = 0;
891 int pool_stride_y = 0;
892 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
893 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
894 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
895 const int pad_left = _input->info()->padding().left;
896 const int pad_right = _input->info()->padding().right;
897 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
898
899 execute_window_loop(window, [&](const Coordinates & id)
900 {
901 auto top_data = wrapper::vload(reinterpret_cast<const T *>(input_top_ptr + input.offset()));
902 auto bottom_data = wrapper::vload(reinterpret_cast<const T *>(input_bottom_ptr + input.offset()));
903 float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
904 float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
905
906 // Calculate max data, compare top first, then bottom, to make sue the first max is recorded.
907 const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
908 const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
909 const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
910 *(reinterpret_cast<T *>(output.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
911
912 // Calculate max data indice, which will be used in max unpool.
913 const uint32_t offset_base = offset_no_padding<T>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
914 const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
915 const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
916 const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
917 const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
918 const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
919 const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
920 *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
921 },
922 input, output, indices);
923}
924
Pablo Tello77e6c552018-12-04 15:33:49 +0000925void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100926{
Pablo Tello77e6c552018-12-04 15:33:49 +0000927 ARM_COMPUTE_UNUSED(pooling_type);
928 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000929#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhange0681992020-07-14 15:29:28 +0100930 if(pooling_type == PoolingType::MAX && _indices)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100931 {
Sheri Zhange0681992020-07-14 15:29:28 +0100932 pooling2_nchw_maxpool_indices<float16_t>(window_input, window);
933 }
934 else
935 {
936 Iterator input(_input, window_input);
937 Iterator output(_output, window);
938 constexpr int pool_size = 2;
939 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
940 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
941 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
942 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
943 int pool_stride_x, pool_stride_y = 0;
944 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
945 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
946 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100947
Sheri Zhange0681992020-07-14 15:29:28 +0100948 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
949 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
950
951 execute_window_loop(window, [&](const Coordinates & id)
Georgios Pinitascdf51452017-08-31 14:21:36 +0100952 {
Sheri Zhange0681992020-07-14 15:29:28 +0100953 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
954 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
955 float16x4_t res = {};
Georgios Pinitascdf51452017-08-31 14:21:36 +0100956
Sheri Zhange0681992020-07-14 15:29:28 +0100957 // Get power of 2 in case of l2 pooling
958 if(pooling_type == PoolingType::L2)
959 {
960 top_data = vmul_f16(top_data, top_data);
961 bottom_data = vmul_f16(bottom_data, bottom_data);
962 }
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100963
Sheri Zhange0681992020-07-14 15:29:28 +0100964 if(pooling_type != PoolingType::MAX)
965 {
966 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
967 const float16x4_t scale_v = vdup_n_f16(scale);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100968
Sheri Zhange0681992020-07-14 15:29:28 +0100969 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
970 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
971 }
972 else
973 {
974 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
975 res = vpmax_f16(max_data, max_data);
976 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100977
Sheri Zhange0681992020-07-14 15:29:28 +0100978 // Calculate square-root in case of l2 pooling
979 if(pooling_type == PoolingType::L2)
980 {
981 res = vinv_f16(vinvsqrt_f16(res));
982 }
983
984 // Store result
985 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
986 },
987 input, output);
988 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000989#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100990 ARM_COMPUTE_UNUSED(window_input);
991 ARM_COMPUTE_UNUSED(window);
992 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000993#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100994}
995
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000996template <typename T>
997void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000998{
999 Iterator input(_input, window_input);
1000 Iterator output(_output, window);
1001
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001002 /** NEON vector types */
1003 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1004 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
1005 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
1006 using q16_t = typename wrapper::traits::promote_t<T>;
1007 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1008 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
1009
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001010 constexpr int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001011 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1012 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1013 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1014 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001015 int pool_stride_x = 0;
1016 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001017 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001018 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1019 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001020
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001021 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1022 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001023
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001024 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
1025 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1026 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
1027
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001028 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
1029 const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
1030 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001031
1032 execute_window_loop(window, [&](const Coordinates & id)
1033 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001034 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
1035 const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
1036 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
1037 q8x8_t fres = {};
1038 q8x16_t fqres = {};
Georgios Pinitas55186712018-01-08 17:37:12 +00001039
1040 if(pooling_type == PoolingType::AVG)
1041 {
1042 // Convert data to u16
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001043 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
1044 const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
1045 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +00001046
1047 // Calculate row sums
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001048 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001049 {
1050 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001051 wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
1052 wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001053 }
1054 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001055 const q16x8x2_t vrsum_shifted_1 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001056 {
1057 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001058 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
1059 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001060 }
1061 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001062 const q16x8x2_t vrsum_shifted_2 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001063 {
1064 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001065 wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
1066 wrapper::vext_2(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001067 }
1068 };
1069 // Calculate final sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001070 q16x8x2_t final_sum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001071 {
1072 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001073 wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1074 wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001075 }
1076 };
1077 if(pool_stride_x == 2)
1078 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001079 q16x8_t res =
Georgios Pinitas55186712018-01-08 17:37:12 +00001080 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001081 wrapper::vgetlane(final_sum.val[0], 0),
1082 wrapper::vgetlane(final_sum.val[0], 2),
1083 wrapper::vgetlane(final_sum.val[0], 4),
1084 wrapper::vgetlane(final_sum.val[0], 6),
1085 wrapper::vgetlane(final_sum.val[1], 0),
1086 wrapper::vgetlane(final_sum.val[1], 2),
1087 wrapper::vgetlane(final_sum.val[1], 4),
1088 wrapper::vgetlane(final_sum.val[1], 6),
Georgios Pinitas55186712018-01-08 17:37:12 +00001089 };
1090
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001091 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
1092 pool_size, upper_bound_w, upper_bound_h,
1093 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1094 fres = wrapper::vmovn(res);
Georgios Pinitas55186712018-01-08 17:37:12 +00001095 }
1096 else
1097 {
1098 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001099 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1100 pool_size, upper_bound_w, upper_bound_h,
1101 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001102 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001103 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1104 pool_size, upper_bound_w, upper_bound_h,
1105 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1106 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +00001107 }
1108 }
1109 else
1110 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001111 const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1112 const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1113 const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1114 const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
Georgios Pinitas55186712018-01-08 17:37:12 +00001115
1116 if(pool_stride_x == 2)
1117 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001118 const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1119 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1120 fres = wrapper::vtbl(table, lookup_val);
Georgios Pinitas55186712018-01-08 17:37:12 +00001121 }
1122 else
1123 {
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001124 fqres = final_max;
Georgios Pinitas55186712018-01-08 17:37:12 +00001125 }
1126 }
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001127
1128 // Store result
1129 if(pool_stride_x == 1)
1130 {
1131 if(input_qinfo != output_qinfo)
1132 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001133 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001134 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001135 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001136 }
1137 else
1138 {
1139 if(input_qinfo != output_qinfo)
1140 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001141 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001142 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001143 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001144 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001145 },
1146 input, output);
1147}
1148
Pablo Tello77e6c552018-12-04 15:33:49 +00001149void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001150{
Pablo Tello77e6c552018-12-04 15:33:49 +00001151 ARM_COMPUTE_UNUSED(pooling_type);
1152 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001153#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1154 Iterator input(_input, window_input);
1155 Iterator output(_output, window);
1156
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001157 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1158 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1159 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1160 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1161 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1162 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001163 int pool_stride_x = 0;
1164 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001165 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001166 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1167 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1168
1169 execute_window_loop(window, [&](const Coordinates & id)
1170 {
1171 float16_t res = 0.0f;
1172 float16x8_t vres = vdupq_n_f16(0.0f);
1173
1174 if(pooling_type != PoolingType::MAX)
1175 {
1176 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001177 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001178
1179 // Perform pooling
1180
1181 for(int y = 0; y < pool_size_y; ++y)
1182 {
1183 int x = 0;
1184 for(; x <= (pool_size_x - 8); x += 8)
1185 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001186 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1187 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001188
1189 // Get power of 2 in case of l2 pooling and accumulate
1190 if(pooling_type == PoolingType::L2)
1191 {
1192 vres = vaddq_f16(vres, vmulq_f16(data, data));
1193 }
1194 else
1195 {
1196 vres = vaddq_f16(vres, data);
1197 }
1198 }
1199
1200 // Leftover for loop
1201 for(; x < pool_size_x; ++x)
1202 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001203 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1204 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001205
1206 // Get power of 2 in case of l2 pooling
1207 if(pooling_type == PoolingType::L2)
1208 {
1209 data *= data;
1210 }
1211
1212 res += data;
1213 }
1214 }
1215
1216 // Reduction
1217 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1218 res += vget_lane_f16(tmp, 0);
1219 res += vget_lane_f16(tmp, 1);
1220 res += vget_lane_f16(tmp, 2);
1221 res += vget_lane_f16(tmp, 3);
1222
1223 // Divide by scale
1224 res *= scale;
1225 }
1226 else
1227 {
1228 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1229 res = std::numeric_limits<float>::lowest();
1230
1231 for(int y = 0; y < pool_size_y; ++y)
1232 {
1233 int x = 0;
1234 for(; x <= (pool_size_x - 8); x += 8)
1235 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001236 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1237 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001238 vres = vmaxq_f16(vres, data);
1239 }
1240
1241 // Leftover for loop
1242 for(; x < pool_size_x; ++x)
1243 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001244 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1245 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1246 res = std::max(res, data);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001247 }
1248 }
1249
1250 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1251 res = std::max(res, vget_lane_f16(tmp, 0));
1252 res = std::max(res, vget_lane_f16(tmp, 1));
1253 res = std::max(res, vget_lane_f16(tmp, 2));
1254 res = std::max(res, vget_lane_f16(tmp, 3));
1255 }
1256
1257 // Calculate square-root in case of l2 pooling
1258 if(pooling_type == PoolingType::L2)
1259 {
1260 res = std::sqrt(res);
1261 }
1262
1263 // Store result
1264 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1265 },
1266 input, output);
1267
1268#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1269 ARM_COMPUTE_UNUSED(window_input);
1270 ARM_COMPUTE_UNUSED(window);
1271 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1272#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1273}
1274
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** 2x2 max pooling on an F16 NHWC tensor that also records which input element was selected.
 *
 * For each output position the four input elements of the 2x2 window are compared, channel by channel.
 * The largest value is written to the output tensor. The index of the element it came from is written
 * to the indices tensor as a 32-bit value. That index is derived from offset_no_padding(), expressed in
 * F16 elements. On equal values the first operand of each comparison wins (x0 over x1, x2 over x3,
 * top pair over bottom pair).
 *
 * Channels are processed eight at a time with a scalar loop for the remainder. Both paths select among
 * full 32-bit indices: the 16-bit comparison masks are sign-extended to 32 bits before the select, so
 * indices larger than 65535 are preserved.
 *
 * @param[in] window_input Window used to iterate over the input tensor.
 * @param[in] window       Window used to iterate over the output and indices tensors; its X range is walked manually.
 */
void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window)
{
    const int window_start_x = window.x().start();
    const int window_end_x   = window.x().end();
    const int window_step_x  = 8;

    // X is collapsed in the execution window because the channel loop is written out by hand below
    Window window_out = window;
    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, window_input);
    Iterator output(_output, window_out);
    Iterator indices(_indices, window_out);

    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();

    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();

    const int pad_right   = _input->info()->padding().right;
    const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
    const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());

    // Sign-extend the low/high four lanes of a 16-bit comparison mask (0x0000 or 0xFFFF per lane)
    // into a 32-bit mask (0x00000000 or 0xFFFFFFFF per lane) usable with vbslq_u32
    const auto widen_mask_low = [](uint16x8_t mask)
    {
        return vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(mask))));
    };
    const auto widen_mask_high = [](uint16x8_t mask)
    {
        return vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(mask))));
    };

    execute_window_loop(window_out, [&](const Coordinates & id)
    {
        const int idx_width    = id.y() * pool_stride_x;
        const int idx_height   = id.z() * pool_stride_y;
        const int pool_limit_y = pool_pad_top - idx_height;
        const int pool_limit_x = pool_pad_left - idx_width;

        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);

        // Byte offsets of the four window elements: x0 = top-left, x1 = top-right, x2 = bottom-left, x3 = bottom-right
        const int in_x0_offset = (pool_start_x - pool_pad_left) * in_stride_y + (pool_start_y - pool_pad_top) * in_stride_z;
        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * in_stride_y + (pool_start_y - pool_pad_top) * in_stride_z;
        const int in_x2_offset = (pool_start_x - pool_pad_left) * in_stride_y + (pool_start_y + 1 - pool_pad_top) * in_stride_z;
        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * in_stride_y + (pool_start_y + 1 - pool_pad_top) * in_stride_z;

        int x_off = window_start_x;

        // Vector loop; a trailing chunk of exactly window_step_x elements is left to the scalar loop
        for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
        {
            const float16x8_t v_x0 = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off);
            const float16x8_t v_x1 = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off);
            const float16x8_t v_x2 = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off);
            const float16x8_t v_x3 = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off);

            const float16x8_t vmax_01 = vmaxq_f16(v_x0, v_x1);
            const float16x8_t vmax_23 = vmaxq_f16(v_x2, v_x3);

            // Store result
            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vmaxq_f16(vmax_23, vmax_01));

            // Element indices of the four candidates for the first of the eight channels
            const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
            const uint32_t offset_x0   = offset_base / sizeof(float16_t) + x_off;
            const uint32_t offset_x1   = offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t offset_x2   = offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
            const uint32_t offset_x3   = offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;

            // Consecutive channels have consecutive indices
            const uint32x4_t voffset_x0_low  = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
            const uint32x4_t voffset_x0_high = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
            const uint32x4_t voffset_x1_low  = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
            const uint32x4_t voffset_x1_high = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
            const uint32x4_t voffset_x2_low  = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
            const uint32x4_t voffset_x2_high = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
            const uint32x4_t voffset_x3_low  = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
            const uint32x4_t voffset_x3_high = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };

            // Per-lane winners: within the top pair, within the bottom pair, and between the two pairs
            const uint16x8_t mask_01   = vcgeq_f16(v_x0, v_x1);
            const uint16x8_t mask_23   = vcgeq_f16(v_x2, v_x3);
            const uint16x8_t mask_0123 = vcgeq_f16(vmax_01, vmax_23);

            // Select on 32-bit lanes so that indices above 65535 are not truncated
            const uint32x4_t idx_01_low  = vbslq_u32(widen_mask_low(mask_01), voffset_x0_low, voffset_x1_low);
            const uint32x4_t idx_01_high = vbslq_u32(widen_mask_high(mask_01), voffset_x0_high, voffset_x1_high);
            const uint32x4_t idx_23_low  = vbslq_u32(widen_mask_low(mask_23), voffset_x2_low, voffset_x3_low);
            const uint32x4_t idx_23_high = vbslq_u32(widen_mask_high(mask_23), voffset_x2_high, voffset_x3_high);
            const uint32x4_t idx_low     = vbslq_u32(widen_mask_low(mask_0123), idx_01_low, idx_23_low);
            const uint32x4_t idx_high    = vbslq_u32(widen_mask_high(mask_0123), idx_01_high, idx_23_high);

            // Store indices
            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, idx_low);
            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off + 4, idx_high);
        }

        // Left-overs loop
        for(; x_off < window_end_x; ++x_off)
        {
            const auto x0 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off);
            const auto x1 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off);
            const auto x2 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off);
            const auto x3 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off);
            float16_t  res = std::max(std::max(x2, x3), std::max(x0, x1));

            // Store result
            *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;

            const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
            const uint32_t offset_x0   = offset_base / sizeof(float16_t) + x_off;
            const uint32_t offset_x1   = offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t offset_x2   = offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
            const uint32_t offset_x3   = offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t tmp_idx0    = (x0 >= x1) ? offset_x0 : offset_x1;
            const uint32_t tmp_idx1    = (x2 >= x3) ? offset_x2 : offset_x3;
            const uint32_t tmp_idx2    = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;

            // Store indices
            *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
        }
    },
    input, output, indices);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1388
Pablo Tello77e6c552018-12-04 15:33:49 +00001389void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001390{
Pablo Tello77e6c552018-12-04 15:33:49 +00001391 ARM_COMPUTE_UNUSED(pooling_type);
1392 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001393#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhange0681992020-07-14 15:29:28 +01001394 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1395 {
1396 pooling2_f16_nhwc_maxpool_indices(window_input, window);
1397 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001398 const int window_start_x = window.x().start();
1399 const int window_end_x = window.x().end();
1400 const int window_step_x = 8;
1401
1402 Window window_out = window;
1403 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1404
Michalis Spyrou57dac842018-03-01 16:03:50 +00001405 Iterator input(_input, window_input);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001406 Iterator output(_output, window_out);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001407
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001408 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1409 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1410 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1411 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1412 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1413 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001414 int pool_stride_x = 0;
1415 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001416 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001417 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1418 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1419
1420 float16x8_t vres;
1421
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001422 execute_window_loop(window_out, [&](const Coordinates & id)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001423 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001424 const int idx_width = id.y() * pool_stride_x;
1425 const int idx_height = id.z() * pool_stride_y;
1426 const int pool_limit_y = pool_pad_top - idx_height;
1427 const int pool_limit_x = pool_pad_left - idx_width;
1428
1429 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1430 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1431 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1432 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1433
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001434 int x_off = window_start_x;
1435 for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001436 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001437 if(pooling_type != PoolingType::MAX)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001438 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001439 // Calculate scale
1440 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1441 pool_stride_y);
1442 const float16x8_t scale_v = vdupq_n_f16(scale);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001443
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001444 // Perform pooling
1445 vres = vdupq_n_f16(0.0f);
1446 for(int y = pool_start_y; y < pool_end_y; ++y)
1447 {
1448 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001449 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001450 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1451 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
1452
1453 // Get power of 2 in case of l2 pooling and accumulate
1454 if(pooling_type == PoolingType::L2)
1455 {
1456 vres = vaddq_f16(vres, vmulq_f16(data, data));
1457 }
1458 else
1459 {
1460 vres = vaddq_f16(vres, data);
1461 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001462 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001463 }
1464 // Divide by scale
1465 vres = vmulq_f16(vres, scale_v);
1466 }
1467 else
1468 {
1469 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1470
1471 for(int y = pool_start_y; y < pool_end_y; ++y)
1472 {
1473 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001474 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001475 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1476 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
1477 vres = vmaxq_f16(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001478 }
1479 }
1480 }
Michalis Spyrouced25572018-10-01 16:26:20 +01001481
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001482 // Calculate square-root in case of l2 pooling
1483 if(pooling_type == PoolingType::L2)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001484 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001485 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1486 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1487 }
1488
1489 // Store result
1490 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
1491 }
1492
1493 // Left-overs loop
1494 for(; x_off < window_end_x; ++x_off)
1495 {
1496 float16_t res = 0.0f;
1497
1498 if(pooling_type != PoolingType::MAX)
1499 {
1500 // Calculate scale
1501 const float16_t scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1502 pool_stride_y);
1503
1504 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001505 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001506 for(int x = pool_start_x; x < pool_end_x; ++x)
1507 {
1508 const float data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1509 (_input->info()->strides_in_bytes().z())) + x_off);
1510
1511 // Get power of 2 in case of l2 pooling and accumulate
1512 if(pooling_type == PoolingType::L2)
1513 {
1514 res += data * data;
1515 }
1516 else
1517 {
1518 res += data;
1519 }
1520 }
1521 }
1522
1523 // Divide by scale
1524 res *= scale;
1525 }
1526 else
1527 {
1528 res = std::numeric_limits<float>::lowest();
1529 for(int y = pool_start_y; y < pool_end_y; ++y)
1530 {
1531 for(int x = pool_start_x; x < pool_end_x; ++x)
1532 {
1533 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1534 (_input->info()->strides_in_bytes().z())) + x_off);
1535 res = std::max(res, data);
1536 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001537 }
1538 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001539
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001540 // Calculate square-root in case of l2 pooling
1541 if(pooling_type == PoolingType::L2)
1542 {
1543 res = std::sqrt(res);
1544 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001545
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001546 // Store result
1547 *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
1548 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001549 },
1550 input, output);
1551
1552#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1553 ARM_COMPUTE_UNUSED(window_input);
1554 ARM_COMPUTE_UNUSED(window);
1555 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1556#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1557}
1558
Pablo Tello77e6c552018-12-04 15:33:49 +00001559void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001560{
1561 Iterator input(_input, window_input);
1562 Iterator output(_output, window);
1563
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001564 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1565 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1566 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1567 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1568 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1569 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001570 int pool_stride_x = 0;
1571 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001572 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001573 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1574 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001575
1576 execute_window_loop(window, [&](const Coordinates & id)
1577 {
1578 float res = 0.0f;
1579
1580 if(pooling_type != PoolingType::MAX)
1581 {
1582 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001583 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001584
1585 // Perform pooling
1586 float32x4_t vres = vdupq_n_f32(0.0f);
1587
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001588 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001589 {
1590 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001591 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001592 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001593 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1594 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001595
1596 // Get power of 2 in case of l2 pooling and accumulate
1597 if(pooling_type == PoolingType::L2)
1598 {
1599 vres = vmlaq_f32(vres, data, data);
1600 }
1601 else
1602 {
1603 vres = vaddq_f32(vres, data);
1604 }
1605 }
1606
1607 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001608 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001609 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001610 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1611 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001612
1613 // Get power of 2 in case of l2 pooling
1614 if(pooling_type == PoolingType::L2)
1615 {
1616 data *= data;
1617 }
1618
1619 res += data;
1620 }
1621 }
1622
1623#if defined(__aarch64__)
1624 // Reduction operation available on 64 bit architectures only
1625 res += vaddvq_f32(vres);
1626#else // __aarch64__
1627 // Reduction
1628 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1629 tmp = vpadd_f32(tmp, tmp);
1630
1631 res += vget_lane_f32(tmp, 0);
1632#endif // __aarch64__
1633 // Divide by scale
1634 res *= scale;
1635 }
1636 else
1637 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001638 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1639 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001640
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001641 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001642 {
1643 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001644 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001645 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001646 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1647 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001648 vres = vmaxq_f32(vres, data);
1649 }
1650
1651 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001652 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001653 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001654 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1655 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001656 res = std::max(res, data);
1657 }
1658 }
Gian Marco Iodice16824302017-09-28 15:41:37 +01001659#if defined(__aarch64__)
1660 // Reduction operation available on 64 bit architectures only
1661 res = std::max(vmaxvq_f32(vres), res);
1662#else // __aarch64__
1663 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1664 tmp = vpmax_f32(tmp, tmp);
1665
1666 res = std::max(res, vget_lane_f32(tmp, 0));
1667#endif // __aarch64__
1668 }
1669
1670 // Calculate square-root in case of l2 pooling
1671 if(pooling_type == PoolingType::L2)
1672 {
1673 res = std::sqrt(res);
1674 }
1675
1676 // Store result
1677 *(reinterpret_cast<float *>(output.ptr())) = res;
1678 },
1679 input, output);
1680}
1681
morgolockcc1f6c92020-03-24 09:26:48 +00001682void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
1683 bool exclude_padding)
1684{
1685 if(pooling_type == PoolingType::MAX && _indices)
1686 {
Sheri Zhange0681992020-07-14 15:29:28 +01001687 pooling2_nchw_maxpool_indices<float>(window_input, window);
morgolockcc1f6c92020-03-24 09:26:48 +00001688 }
1689 else
1690 {
1691 Iterator input(_input, window_input);
1692 Iterator output(_output, window);
1693 constexpr int pool_size = 2;
1694 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1695 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1696 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1697 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1698 int pool_stride_x = 0;
1699 int pool_stride_y = 0;
1700 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1701 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1702 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1703
1704 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1705 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1706
1707 execute_window_loop(window, [&](const Coordinates & id)
1708 {
1709 const auto in_top_ptr = reinterpret_cast<const float *>(input_top_ptr + input.offset());
1710 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
1711 float32x2_t top_data = vld1_f32(in_top_ptr);
1712 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1713 float32x2_t res = {};
1714 float final_res = 0;
1715 // Get power of 2 in case of l2 pooling
1716 if(pooling_type == PoolingType::L2)
1717 {
1718 top_data = vmul_f32(top_data, top_data);
1719 bottom_data = vmul_f32(bottom_data, bottom_data);
1720 }
1721
1722 if(pooling_type != PoolingType::MAX)
1723 {
1724 // Calculate scale
1725 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1726 const float32x2_t scale_v = vdup_n_f32(scale);
1727
1728 // Perform pooling
1729 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1730 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1731 }
1732 else
1733 {
1734 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1735 res = vpmax_f32(max_data, max_data);
1736 }
1737 final_res = vget_lane_f32(res, 0);
1738
1739 // Calculate square-root in case of l2 pooling
1740 if(pooling_type == PoolingType::L2)
1741 {
1742 final_res = sqrt(final_res);
1743 }
1744
1745 // Store result
1746 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1747 },
1748 input, output);
1749 }
Pablo Tello77e6c552018-12-04 15:33:49 +00001750}
1751
1752void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1753{
1754 Iterator input(_input, window_input);
1755 Iterator output(_output, window);
1756
1757 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001758 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1759 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1760 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1761 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001762 int pool_stride_x = 0;
1763 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001764 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001765 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1766 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1767
1768 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1769 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1770 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1771
1772 execute_window_loop(window, [&](const Coordinates & id)
1773 {
1774 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1775 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1776 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1777 float32x2_t res = {};
1778 float final_res = 0;
1779
1780 // Get power of 2 in case of l2 pooling
1781 if(pooling_type == PoolingType::L2)
1782 {
1783 top_data = vmulq_f32(top_data, top_data);
1784 middle_data = vmulq_f32(middle_data, middle_data);
1785 bottom_data = vmulq_f32(bottom_data, bottom_data);
1786 }
1787
1788 if(pooling_type != PoolingType::MAX)
1789 {
1790 // Calculate scale
1791 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1792 const float32x2_t scale_v = vdup_n_f32(scale);
1793
1794 // Perform pooling
1795 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1796 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1797 res = vmul_f32(vpadd_f32(res, res), scale_v);
1798 }
1799 else
1800 {
1801 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1802 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1803 res = vpmax_f32(res, res);
1804 }
1805 final_res = vget_lane_f32(res, 0);
1806
1807 // Calculate square-root in case of l2 pooling
1808 if(pooling_type == PoolingType::L2)
1809 {
1810 final_res = sqrt(final_res);
1811 }
1812
1813 // Store result
1814 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1815 },
1816 input, output);
1817}
1818
1819void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1820{
1821 Iterator input(_input, window_input);
1822 Iterator output(_output, window);
1823
1824 constexpr const int pool_size = 7;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001825 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1826 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1827 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1828 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001829 int pool_stride_x = 0;
1830 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001831 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001832 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1833 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1834
1835 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1836 for(int i = 0; i < pool_size; ++i)
1837 {
1838 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1839 }
1840
1841 execute_window_loop(window, [&](const Coordinates & id)
1842 {
1843 float32x2_t res = {};
1844 float final_res = 0.f;
1845 if(pooling_type != PoolingType::MAX)
1846 {
1847 // Calculate scale
1848 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1849 const float32x2_t scale_v = vdup_n_f32(scale);
1850
1851 // Perform pooling
1852 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1853 // Get power of 2 in case of l2 pooling
1854 if(pooling_type == PoolingType::L2)
1855 {
1856 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1857 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1858 }
1859 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1860 for(int i = 1; i < pool_size; ++i)
1861 {
1862 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1863 // Get power of 2 in case of l2 pooling
1864 if(pooling_type == PoolingType::L2)
1865 {
1866 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1867 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1868 }
1869 sum_data = vaddq_f32(sum_data, data.val[0]);
1870 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1871 }
1872 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1873 res = vmul_f32(vpadd_f32(res, res), scale_v);
1874 }
1875 else
1876 {
1877 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1878 for(int i = 1; i < pool_size; ++i)
1879 {
1880 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1881 max_data = vmax2q_f32(max_data, data);
1882 }
1883 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1884 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1885 res = vpmax_f32(res, res);
1886 }
1887 final_res = vget_lane_f32(res, 0);
1888
1889 // Calculate square-root in case of l2 pooling
1890 if(pooling_type == PoolingType::L2)
1891 {
1892 final_res = sqrt(final_res);
1893 }
1894
1895 // Store result
1896 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1897 },
1898 input, output);
1899}
1900
1901void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001902{
morgolocke383c352020-04-03 16:57:46 +01001903 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1904 {
1905 pooling2_f32_nhwc_maxpool_indices(window_input, window);
1906 }
1907 else
1908 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001909 const int window_start_x = window.x().start();
1910 const int window_end_x = window.x().end();
1911 const int window_step_x = 4;
1912
1913 Window window_out = window;
1914 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1915
morgolocke383c352020-04-03 16:57:46 +01001916 Iterator input(_input, window_input);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001917 Iterator output(_output, window_out);
morgolocke383c352020-04-03 16:57:46 +01001918
1919 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1920 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1921 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1922 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1923 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1924 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1925 int pool_stride_x = 0;
1926 int pool_stride_y = 0;
1927 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1928 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1929 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1930
1931 float32x4_t vres;
1932
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001933 execute_window_loop(window_out, [&](const Coordinates & id)
morgolocke383c352020-04-03 16:57:46 +01001934 {
1935 const int idx_width = id.y() * pool_stride_x;
1936 const int idx_height = id.z() * pool_stride_y;
1937 const int pool_limit_y = pool_pad_top - idx_height;
1938 const int pool_limit_x = pool_pad_left - idx_width;
1939
1940 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1941 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1942 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1943 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1944
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001945 int x_off = window_start_x;
1946 for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
morgolocke383c352020-04-03 16:57:46 +01001947 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001948 if(pooling_type != PoolingType::MAX)
morgolocke383c352020-04-03 16:57:46 +01001949 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001950 // Calculate scale
1951 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1952 pool_stride_y);
1953 const float32x4_t scale_v = vdupq_n_f32(scale);
morgolocke383c352020-04-03 16:57:46 +01001954
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001955 // Perform pooling
1956 vres = vdupq_n_f32(0.0f);
1957
1958 for(int y = pool_start_y; y < pool_end_y; ++y)
1959 {
1960 for(int x = pool_start_x; x < pool_end_x; ++x)
morgolocke383c352020-04-03 16:57:46 +01001961 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001962 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1963 (_input->info()->strides_in_bytes().z())) + x_off);
1964
1965 // Get power of 2 in case of l2 pooling and accumulate
1966 if(pooling_type == PoolingType::L2)
1967 {
1968 vres = vmlaq_f32(vres, data, data);
1969 }
1970 else
1971 {
1972 vres = vaddq_f32(vres, data);
1973 }
morgolocke383c352020-04-03 16:57:46 +01001974 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001975 }
1976 // Divide by scale
1977 vres = vmulq_f32(vres, scale_v);
1978 }
1979 else
1980 {
1981 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1982 for(int y = pool_start_y; y < pool_end_y; ++y)
1983 {
1984 for(int x = pool_start_x; x < pool_end_x; ++x)
morgolocke383c352020-04-03 16:57:46 +01001985 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001986 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1987 (_input->info()->strides_in_bytes().z())) + x_off);
1988 vres = vmaxq_f32(vres, data);
morgolocke383c352020-04-03 16:57:46 +01001989 }
1990 }
1991 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001992
1993 // Calculate square-root in case of l2 pooling
1994 if(pooling_type == PoolingType::L2)
morgolocke383c352020-04-03 16:57:46 +01001995 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001996 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1997 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
1998 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
1999 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
2000 };
2001 vres = l2_res;
2002 }
2003
2004 // Store result
2005 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
2006 }
2007
2008 // Left-overs loop
2009 for(; x_off < window_end_x; ++x_off)
2010 {
2011 float res = 0.0f;
2012
2013 if(pooling_type != PoolingType::MAX)
2014 {
2015 // Calculate scale
2016 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2017 pool_stride_y);
2018
2019 for(int y = pool_start_y; y < pool_end_y; ++y)
morgolocke383c352020-04-03 16:57:46 +01002020 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002021 for(int x = pool_start_x; x < pool_end_x; ++x)
2022 {
2023 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2024 (_input->info()->strides_in_bytes().z())) + x_off);
2025
2026 // Get power of 2 in case of l2 pooling and accumulate
2027 if(pooling_type == PoolingType::L2)
2028 {
2029 res += data * data;
2030 }
2031 else
2032 {
2033 res += data;
2034 }
2035 }
2036 }
2037
2038 // Divide by scale
2039 res *= scale;
2040 }
2041 else
2042 {
2043 res = std::numeric_limits<float>::lowest();
2044 for(int y = pool_start_y; y < pool_end_y; ++y)
2045 {
2046 for(int x = pool_start_x; x < pool_end_x; ++x)
2047 {
2048 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2049 (_input->info()->strides_in_bytes().z())) + x_off);
2050 res = std::max(res, data);
2051 }
morgolocke383c352020-04-03 16:57:46 +01002052 }
2053 }
morgolocke383c352020-04-03 16:57:46 +01002054
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002055 // Calculate square-root in case of l2 pooling
2056 if(pooling_type == PoolingType::L2)
2057 {
2058 res = std::sqrt(res);
2059 }
morgolocke383c352020-04-03 16:57:46 +01002060
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002061 // Store result
2062 *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
2063 }
morgolocke383c352020-04-03 16:57:46 +01002064 },
2065 input, output);
2066 }
2067}
2068
2069void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
2070{
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002071 const int window_start_x = window.x().start();
2072 const int window_end_x = window.x().end();
2073 const int window_step_x = 4;
2074
2075 Window window_out = window;
2076 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
2077
Michalis Spyrou57dac842018-03-01 16:03:50 +00002078 Iterator input(_input, window_input);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002079 Iterator output(_output, window_out);
2080 Iterator indices(_indices, window_out);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002081
morgolocke383c352020-04-03 16:57:46 +01002082 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2083 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2084
2085 int pool_stride_x = 0;
2086 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002087 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002088
2089 float32x4_t vres;
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002090 float res;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002091
morgolocke383c352020-04-03 16:57:46 +01002092 const int pad_right = _input->info()->padding().right;
morgolocke383c352020-04-03 16:57:46 +01002093 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
2094 const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
morgolocke383c352020-04-03 16:57:46 +01002095
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002096 execute_window_loop(window_out, [&](const Coordinates & id)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002097 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002098 const int idx_width = id.y() * pool_stride_x;
2099 const int idx_height = id.z() * pool_stride_y;
2100 const int pool_limit_y = pool_pad_top - idx_height;
2101 const int pool_limit_x = pool_pad_left - idx_width;
2102
2103 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
Michalis Spyrouced25572018-10-01 16:26:20 +01002104 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002105
morgolocke383c352020-04-03 16:57:46 +01002106 const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2107 (_input->info()->strides_in_bytes().z());
2108 const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2109 (_input->info()->strides_in_bytes().z());
morgolocke383c352020-04-03 16:57:46 +01002110 const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2111 (_input->info()->strides_in_bytes().z());
morgolocke383c352020-04-03 16:57:46 +01002112 const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2113 (_input->info()->strides_in_bytes().z());
Michalis Spyrou57dac842018-03-01 16:03:50 +00002114
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002115 int x_off = window_start_x;
2116 for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
2117 {
2118 const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
2119 const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
2120 const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
2121 const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
2122 const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
2123 const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
2124 const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
2125 const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
2126 vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
2127 // Store result
2128 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
morgolocke383c352020-04-03 16:57:46 +01002129
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002130 const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2131 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
2132 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2133 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2134 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
2135 const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
2136 const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
2137 const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
2138 const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
2139 const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
2140 const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
2141 const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
2142
2143 // Store indices
2144 vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
2145 }
2146
2147 // Left-overs loop
2148 for(; x_off < window_end_x; ++x_off)
2149 {
2150 const auto x0 = *(reinterpret_cast<const float *>(input.ptr() + in_x0_offset) + x_off);
2151 const auto x1 = *(reinterpret_cast<const float *>(input.ptr() + in_x1_offset) + x_off);
2152 const auto x2 = *(reinterpret_cast<const float *>(input.ptr() + in_x2_offset) + x_off);
2153 const auto x3 = *(reinterpret_cast<const float *>(input.ptr() + in_x3_offset) + x_off);
2154 res = std::max(std::max(x2, x3), std::max(x0, x1));
2155
2156 // Store result
2157 *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
2158
2159 const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2160 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
2161 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2162 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2163 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
2164 const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
2165 const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
2166 const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
2167
2168 // Store indices
2169 *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
2170 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002171 },
morgolocke383c352020-04-03 16:57:46 +01002172 input, output, indices);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002173}
2174
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002175template <typename T>
2176void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00002177{
2178 Iterator input(_input, window_input);
2179 Iterator output(_output, window);
2180
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002181 /** NEON vector types */
2182 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2183 using q16_t = typename wrapper::traits::promote_t<T>;
2184 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2185 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2186 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2187
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002188 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
2189 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
2190 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2191 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2192 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2193 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00002194 int pool_stride_x = 0;
2195 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002196 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00002197 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
2198 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00002199
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01002200 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
2201 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
2202
Georgios Pinitas55186712018-01-08 17:37:12 +00002203 execute_window_loop(window, [&](const Coordinates & id)
2204 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002205 T res = std::numeric_limits<T>::min();
Georgios Pinitas55186712018-01-08 17:37:12 +00002206
2207 if(pooling_type != PoolingType::MAX)
2208 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002209 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2210 q32_t sres = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00002211
2212 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00002213 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00002214
2215 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002216 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002217 {
2218 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002219 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002220 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002221 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2222 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00002223
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002224 const q16x8_t data_q16 = wrapper::vmovl(data);
2225 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
Georgios Pinitas55186712018-01-08 17:37:12 +00002226 }
2227
2228 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002229 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002230 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002231 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2232 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00002233 sres += data;
2234 }
2235 }
2236
2237 // Reduction
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002238 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
2239 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
Georgios Pinitas55186712018-01-08 17:37:12 +00002240
2241 // Divide by scale
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002242 res = static_cast<T>(support::cpp11::round(sres * scale));
Georgios Pinitas55186712018-01-08 17:37:12 +00002243 }
2244 else
2245 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002246 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
Georgios Pinitas55186712018-01-08 17:37:12 +00002247
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002248 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002249 {
2250 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002251 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002252 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002253 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2254 (_input->info()->strides_in_bytes().y())));
2255 vres = wrapper::vmax(vres, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002256 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002257 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002258 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002259 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002260 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2261 (_input->info()->strides_in_bytes().y())));
2262 res = std::max(res, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002263 }
2264 }
2265
2266 // Reduce max
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002267 vres = wrapper::vpmax(vres, vres);
2268 vres = wrapper::vpmax(vres, vres);
2269 vres = wrapper::vpmax(vres, vres);
Georgios Pinitas55186712018-01-08 17:37:12 +00002270
2271 // Get max value
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002272 res = std::max(res, wrapper::vgetlane(vres, 0));
Georgios Pinitas55186712018-01-08 17:37:12 +00002273 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002274 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002275 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
2276 *(reinterpret_cast<T *>(output.ptr())) = res;
Georgios Pinitas55186712018-01-08 17:37:12 +00002277 },
2278 input, output);
2279}
2280
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002281template <typename T>
2282void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002283{
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002284 const int window_start_x = window.x().start();
2285 const int window_end_x = window.x().end();
2286 const int window_step_x = 16;
2287
2288 Window window_out = window;
2289 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
2290
Michalis Spyrou57dac842018-03-01 16:03:50 +00002291 Iterator input(_input, window_input);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002292 Iterator output(_output, window_out);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002293
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002294 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2295 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
2296 using q16_t = typename wrapper::traits::promote_t<T>;
2297 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2298 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2299 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2300
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002301 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
2302 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
2303 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2304 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2305 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2306 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002307
2308 int pool_stride_x = 0;
2309 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002310 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002311 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
2312 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
2313
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01002314 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
2315 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
2316 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitas283fc602018-11-09 10:46:43 +00002317
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002318 const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002319 // "new_offset" doesn't have to consider the "half_scale_v" in its computation
2320 // With a requantization performed in a single step there won't be uncertainties introduced
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002321 const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002322
2323 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
2324 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
2325 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
2326
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002327 execute_window_loop(window_out, [&](const Coordinates & id)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002328 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002329 const int idx_width = id.y() * pool_stride_x;
2330 const int idx_height = id.z() * pool_stride_y;
2331 const int pool_limit_y = pool_pad_top - idx_height;
2332 const int pool_limit_x = pool_pad_left - idx_width;
2333
2334 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2335 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
2336 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2337 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
2338
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002339 int x_off = window_start_x;
2340 for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002341 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002342 if(pooling_type != PoolingType::MAX)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002343 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002344 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2345 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2346 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2347 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002348
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002349 // Calculate scale
2350 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2351 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002352
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002353 // Perform pooling
2354 for(int y = pool_start_y; y < pool_end_y; ++y)
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002355 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002356 for(int x = pool_start_x; x < pool_end_x; ++x)
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002357 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002358 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2359 (_input->info()->strides_in_bytes().z())) + x_off);
2360
2361 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
2362 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
2363 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
2364 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
2365 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
2366 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002367 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002368 }
2369
2370 if(input_qinfo != output_qinfo)
2371 {
2372 const float32x4x4_t vres =
2373 {
2374 {
2375 vcvtq_f32_q32(vres1),
2376 vcvtq_f32_q32(vres2),
2377 vcvtq_f32_q32(vres3),
2378 vcvtq_f32_q32(vres4),
2379 }
2380 };
2381 const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
2382 // Store result
2383 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, wrapper::vgetlow(requantized_output));
2384 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, wrapper::vgethigh(requantized_output));
2385 }
2386 else
2387 {
2388 const float32x4_t scale_v = vdupq_n_f32(scale);
2389 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2390 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
2391 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
2392 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
2393 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
2394
2395 const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
2396 const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
2397 // Store result
2398 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, res1);
2399 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, res2);
2400 }
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002401 }
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002402 else
2403 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002404 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002405
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002406 for(int y = pool_start_y; y < pool_end_y; ++y)
2407 {
2408 for(int x = pool_start_x; x < pool_end_x; ++x)
2409 {
2410 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2411 (_input->info()->strides_in_bytes().z())) + x_off);
2412 vres = wrapper::vmax(vres, data);
2413 }
2414 }
2415
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002416 // Store result
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002417 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
Georgios Pinitasddb93bb2020-10-02 16:38:59 +01002418 requant_qinfo) :
2419 vres);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002420 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002421 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002422
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002423 // Left-overs loop
2424 for(; x_off < window_end_x; ++x_off)
2425 {
2426 if(pooling_type != PoolingType::MAX)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002427 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002428 q32_t res = static_cast<q32_t>(0.f);
2429
2430 // Calculate scale
2431 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2432 pool_stride_y);
2433
2434 // Perform pooling
2435 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002436 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002437 for(int x = pool_start_x; x < pool_end_x; ++x)
2438 {
2439 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2440 (_input->info()->strides_in_bytes().z())) + x_off);
2441 res += data;
2442 }
2443 }
2444
2445 if(input_qinfo != output_qinfo)
2446 {
2447 const float res_f = static_cast<float>(res);
2448 const float new_scale = quant_rescale / scale;
2449 const auto requantized_output = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
2450
2451 // Store result
2452 *(reinterpret_cast<T *>(output.ptr()) + x_off) = requantized_output;
2453 }
2454 else
2455 {
2456 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2457 res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
2458
2459 // Store result
2460 *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002461 }
2462 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002463 else
2464 {
2465 T res = std::numeric_limits<T>::min();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002466
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002467 for(int y = pool_start_y; y < pool_end_y; ++y)
2468 {
2469 for(int x = pool_start_x; x < pool_end_x; ++x)
2470 {
2471 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2472 (_input->info()->strides_in_bytes().z())) + x_off);
2473 res = std::max(res, data);
2474 }
2475 }
2476
2477 // Store result
2478 if(input_qinfo != output_qinfo)
2479 {
2480 const float res_f = static_cast<float>(res);
2481 *(reinterpret_cast<T *>(output.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
2482 }
2483 else
2484 {
2485 *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
2486 }
2487 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002488 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002489
Michalis Spyrou57dac842018-03-01 16:03:50 +00002490 },
2491 input, output);
2492}
2493
morgolockcc1f6c92020-03-24 09:26:48 +00002494Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002495{
2496 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2497
2498 unsigned int pooled_w = 0;
2499 unsigned int pooled_h = 0;
2500 unsigned int num_elems_processed_per_iteration = 0;
2501 BorderSize border_size(0);
2502
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002503 const bool is_global_pooling = pool_info.is_global_pooling;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002504 unsigned int pool_size_x = 0;
2505 unsigned int pool_size_y = 0;
2506
2507 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +00002508 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2509 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2510 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002511
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002512 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2513 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002514
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002515 // Validate pool info before calling scaled_dimensions
2516 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002517
2518 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00002519 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2520 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002521 pool_size_x,
2522 pool_size_y,
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002523 pool_info.pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002524
morgolockcc1f6c92020-03-24 09:26:48 +00002525 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
2526 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
2527 (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002528 pool_size_x, pool_size_y)
2529 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002530
2531 return Status{};
2532}
2533
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002534void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002535{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002536 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002537 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2538 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2539 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2540
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002541 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
2542 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
2543 const unsigned int pool_size = _pool_info.pool_size.width;
2544 const bool exclude_padding = _pool_info.exclude_padding;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002545
Michalis Spyrou57dac842018-03-01 16:03:50 +00002546 Window window_input(window);
Georgios Pinitas14d9d982019-12-13 12:33:09 +00002547 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002548 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002549 // Set step for input in x and y direction for the input
2550 unsigned int window_x_inc = 0;
2551 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01002552 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002553 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002554 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002555 {
2556 window_x_inc = pool_stride_x;
2557 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2558 {
2559 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2560 }
2561 break;
2562 }
Pablo Tello77e6c552018-12-04 15:33:49 +00002563
Georgios Pinitas13d96e02018-08-23 11:20:23 +01002564 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002565 case DataType::F32:
2566 {
2567 window_x_inc = pool_stride_x;
2568 break;
2569 }
2570 default:
2571 {
2572 ARM_COMPUTE_ERROR("Not supported");
2573 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002574 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002575 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2576 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002577 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002578 else
2579 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002580 window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002581 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2582 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2583 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002584
2585 // Run function
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002586 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002587}
morgolockcc1f6c92020-03-24 09:26:48 +00002588} // namespace arm_compute