blob: 0f0b9eed5abdf20d4c9a4bc8ca5e9b0092651499 [file] [log] [blame]
/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Utils.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010033#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010034#include "src/core/AccessWindowStatic.h"
35#include "src/core/CPP/Validate.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010036#include "src/core/NEON/NEAsymm.h"
37#include "src/core/NEON/NEFixedPoint.h"
38#include "src/core/NEON/NEMath.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010039#include "src/core/helpers/AutoConfiguration.h"
40#include "src/core/helpers/WindowHelpers.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000041#include "support/ToolchainSupport.h"
42
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010043#include "src/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044#include <algorithm>
45#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010046#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010047#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010048#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010049#include <string>
50#include <tuple>
51
Manuel Bottinib4bb8272019-12-18 18:01:27 +000052namespace arm_compute
53{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010054using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055
56namespace
57{
Michalis Spyroucffb2a32020-09-08 16:26:38 +010058template <typename T>
59inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
60quantize(float val, const UniformQuantizationInfo &info)
61{
62 return quantize_qasymm8_signed(val, info);
63}
64
65template <typename T>
66inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
67quantize(float val, const UniformQuantizationInfo &info)
68{
69 return quantize_qasymm8(val, info);
70}
71
Pablo Tello77e6c552018-12-04 15:33:49 +000072inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
74{
Michalis Spyrou57dac842018-03-01 16:03:50 +000075 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
76 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
77
78 int start_x = id[idx_width] * stride_x - pad_x;
79 int start_y = id[idx_height] * stride_y - pad_y;
80
81 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
82 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000083 if(exclude_padding)
84 {
85 start_x = std::max(0, start_x);
86 start_y = std::max(0, start_y);
87 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010088 return 1.f / ((end_y - start_y) * (end_x - start_x));
89}
90
Manuel Bottinib4bb8272019-12-18 18:01:27 +000091template <typename T, typename TVec>
92inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
Georgios Pinitas55186712018-01-08 17:37:12 +000093 const int pool_size, const int upper_bound_w, const int upper_bound_h,
94 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
95{
96 int start_x = (id.x() + id_offset) * stride_x - pad_x;
97 int start_y = id.y() * stride_y - pad_y;
98 const int end_y = std::min(start_y + pool_size, upper_bound_h);
99 if(exclude_padding)
100 {
101 start_y = std::max(0, start_y);
102 }
103
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000104 std::array<T, 8> elems =
Georgios Pinitas55186712018-01-08 17:37:12 +0000105 {
106 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000107 wrapper::vgetlane(v, 0),
108 wrapper::vgetlane(v, 1),
109 wrapper::vgetlane(v, 2),
110 wrapper::vgetlane(v, 3),
111 wrapper::vgetlane(v, 4),
112 wrapper::vgetlane(v, 5),
113 wrapper::vgetlane(v, 6),
114 wrapper::vgetlane(v, 7),
Georgios Pinitas55186712018-01-08 17:37:12 +0000115 }
116 };
117
118 for(auto &el : elems)
119 {
120 int c_start_x = start_x;
121 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
122 if(exclude_padding)
123 {
124 c_start_x = std::max(0, c_start_x);
125 }
126 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
127 el *= scale;
128 start_x += step * stride_x;
129 }
130
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000131 v = wrapper::vsetlane(elems[0], v, 0);
132 v = wrapper::vsetlane(elems[1], v, 1);
133 v = wrapper::vsetlane(elems[2], v, 2);
134 v = wrapper::vsetlane(elems[3], v, 3);
135 v = wrapper::vsetlane(elems[4], v, 4);
136 v = wrapper::vsetlane(elems[5], v, 5);
137 v = wrapper::vsetlane(elems[6], v, 6);
138 v = wrapper::vsetlane(elems[7], v, 7);
Georgios Pinitas55186712018-01-08 17:37:12 +0000139}
140
morgolockcc1f6c92020-03-24 09:26:48 +0000141Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
142 unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100143{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000144 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100145
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000146 int pool_stride_x = 0;
147 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000148 PoolingType pool_type = pool_info.pool_type;
149 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100150 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100151
Anthony Barbiereaefd002018-07-20 17:49:35 +0100152 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
morgolockcc1f6c92020-03-24 09:26:48 +0000153 if(indices)
154 {
morgolock37722d92020-04-09 14:17:48 +0100155 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
morgolockcc1f6c92020-03-24 09:26:48 +0000156 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
157 ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
158 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000159 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000160 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Michele Di Giorgio2c877192020-02-18 19:06:27 +0000161 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
162 && (input->data_layout() == DataLayout::NHWC),
163 "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000164
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000165 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100166 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000167 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000168 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
169 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
170 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
morgolockcc1f6c92020-03-24 09:26:48 +0000171
172 if(indices)
173 {
174 ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
morgolockcc1f6c92020-03-24 09:26:48 +0000175 ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
176 || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
177 }
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100178 }
179
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000180 return Status{};
181}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100182
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000183Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000184{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000185 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
186 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000187
188 return Status{};
189}
190
morgolockcc1f6c92020-03-24 09:26:48 +0000191std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
192 unsigned int &num_elems_processed_per_iteration,
193 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000194 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000195{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100196 // Output auto inizialitation if not yet initialized
197 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
morgolockcc1f6c92020-03-24 09:26:48 +0000198 if(indices)
199 {
200 // Indices auto inizialitation if not yet initialized
morgolocke383c352020-04-03 16:57:46 +0100201 auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
202 pool_info)))
203 .set_data_type(DataType::U32) /* we store the offset to the element */);
morgolockcc1f6c92020-03-24 09:26:48 +0000204 }
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000205 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000206 unsigned int num_elems_read_per_iteration = 0;
207 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000208 int pool_stride_x = 0;
209 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000210 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
211 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
212 const int input_width = input->dimension(idx_width);
213 const int input_height = input->dimension(idx_height);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000214 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000215 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000216 const int pool_pad_right = pad_stride_info.pad_right();
217 const int pool_pad_top = pad_stride_info.pad_top();
218 const int pool_pad_left = pad_stride_info.pad_left();
219 const int pool_pad_bottom = pad_stride_info.pad_bottom();
220 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000221
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000222 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +0000223 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
224 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000225 pool_size_x,
226 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000227 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100228
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000229 //If it's not squared and optimized will be executed the MxN
230 num_elems_read_per_iteration = 1;
231 num_elems_processed_per_iteration = 1;
232 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100233
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000234 if(is_square)
235 {
236 switch(input->data_type())
237 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000238 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000239 case DataType::QASYMM8_SIGNED:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000240 switch(pool_size_x)
241 {
242 case 2:
243 num_elems_read_per_iteration = 16;
244 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
245 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
246 break;
247 case 3:
248 num_elems_read_per_iteration = 16;
249 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
250 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
251 break;
252 default:
253 break;
254 }
255 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000256#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
257 case DataType::F16:
258 switch(pool_size_x)
259 {
260 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000261 case 3:
262 num_elems_read_per_iteration = 4;
263 num_elems_processed_per_iteration = 1;
264 num_elems_horizontal_window = 1;
265 break;
266 default:
267 break;
268 }
269 break;
270#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
271 case DataType::F32:
272 switch(pool_size_x)
273 {
274 case 2:
275 num_elems_read_per_iteration = 2;
276 break;
277 case 3:
278 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
279 break;
280 case 7:
281 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
282 break;
283 default:
284 break;
285 }
286 num_elems_processed_per_iteration = 1;
287 num_elems_horizontal_window = 1;
288 break;
289 default:
290 ARM_COMPUTE_ERROR("Element size not supported");
291 break;
292 }
293 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000294
295 bool window_changed = false;
296 Window win{};
297 if(data_layout == DataLayout::NCHW)
298 {
299 // Number of iterations in X dimension
300 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000301 // Upper limit for the number of right/bottom border elements that are accessed
302 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
303 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
morgolockcc1f6c92020-03-24 09:26:48 +0000304 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
305 border_size.right = std::max(upper_bound_w, pool_pad_right);
306 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000307 TensorShape output_shape{ input->tensor_shape() };
308 output_shape.set(0, pooled_w);
309 output_shape.set(1, pooled_h);
310 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
Michalis Spyrou57dac842018-03-01 16:03:50 +0000311 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
morgolockcc1f6c92020-03-24 09:26:48 +0000312 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000313 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
morgolockcc1f6c92020-03-24 09:26:48 +0000314 if(indices)
315 {
316 AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
317 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
318 }
319 else
320 {
321 window_changed = update_window_and_padding(win, input_access, output_access);
322 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000323 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
324 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000325
326 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
327 return std::make_pair(err, win);
328}
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000329
330template <typename T>
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000331inline T vcvtq_q32_f32(float32x4_t values);
332
333template <>
334inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
335{
336 return vcvtq_u32_f32(values);
337}
338
339template <>
340inline int32x4_t vcvtq_q32_f32(float32x4_t values)
341{
342 return vcvtq_s32_f32(values);
343}
344
345template <typename T>
346inline float32x4_t vcvtq_f32_q32(T values);
347
348template <>
349inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
350{
351 return vcvtq_f32_u32(values);
352}
353
354template <>
355inline float32x4_t vcvtq_f32_q32(int32x4_t values)
356{
357 return vcvtq_f32_s32(values);
358}
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000359
360template <typename Tout>
361inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
362
363template <>
364inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
365{
366 const float new_scale = quant_rescale / scale_pooling;
367 return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
368}
369
370template <>
371inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
372{
373 const float new_scale = quant_rescale / scale_pooling;
374 return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
375}
376
377template <typename Tin, typename Tout>
378inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
379
380template <>
381inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
382{
383 const float32x4x4_t acc =
384 {
385 {
386 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
387 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
388 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
389 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
390 }
391 };
392 return vquantize(acc, requant_qinfo);
393}
394
395template <>
396inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
397{
398 const float32x4x4_t acc =
399 {
400 {
401 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
402 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
403 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
404 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
405 }
406 };
407 return vquantize_signed(acc, requant_qinfo);
408}
409
410template <typename T>
411inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
412
413template <>
414inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
415{
416 const float32x4x2_t acc =
417 {
418 {
419 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
420 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
421 }
422 };
423 return vquantize(acc, requant_qinfo);
424}
425
426template <>
427inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
428{
429 const float32x4x2_t acc =
430 {
431 {
432 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
433 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
434 }
435 };
436 return vquantize_signed(acc, requant_qinfo);
437}
438
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000439} // namespace
440
441NEPoolingLayerKernel::NEPoolingLayerKernel()
morgolockcc1f6c92020-03-24 09:26:48 +0000442 : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000443{
444}
445
446BorderSize NEPoolingLayerKernel::border_size() const
447{
448 return _border_size;
449}
450
morgolockcc1f6c92020-03-24 09:26:48 +0000451void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000452{
453 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000454 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
455 const bool is_global_pooling = pool_info.is_global_pooling;
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000456 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000457
458 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +0000459 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
460 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
461 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000462
463 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000464 const Size2D pool_size(
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000465 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
466 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000467
468 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000469 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000470
471 // Check output dimensions
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100472 unsigned int pooled_w;
473 unsigned int pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000474 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
475 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000476 pool_size.x(),
477 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000478 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000479
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000480 // Perform validation step
morgolockcc1f6c92020-03-24 09:26:48 +0000481 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100482
483 // Set instance variables
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000484 _input = input;
485 _output = output;
morgolockcc1f6c92020-03-24 09:26:48 +0000486 _indices = indices;
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000487 _pool_info = pool_info;
488 _data_layout = input->info()->data_layout();
489 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100490
Georgios Pinitas55186712018-01-08 17:37:12 +0000491 // Get data type
492 const DataType data_type = input->info()->data_type();
Georgios Pinitas14d9d982019-12-13 12:33:09 +0000493 const bool is_nchw = _data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000494
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100495 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000496 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100497 if(!is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000498 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100499 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
500 }
501 else
502 {
503 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100504 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000505 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000506 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100507 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000508 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000509 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000510 }
511 else
512 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000513 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
Pablo Tello77e6c552018-12-04 15:33:49 +0000514 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000515 }
516 }
517 else if(data_type == DataType::QASYMM8_SIGNED)
518 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100519 if(!is_nchw)
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000520 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100521 _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
522 }
523 else
524 {
525 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000526 {
527 _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
528 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100529 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000530 {
531 _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
532 }
533 else
534 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000535 _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
536 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000537 }
538 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000539 else if(data_type == DataType::F16)
540 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100541 if(!is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000542 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100543 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000544 }
545 else
546 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100547 if(_is_square)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000548 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100549 switch(pool_size.x())
550 {
551 case 2:
552 {
553 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
554 }
555 break;
556 case 3:
557 {
558 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
559 }
560 break;
561 default:
562 {
563 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
564 break;
565 }
566 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000567 }
568 else
569 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100570 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000571 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000572 }
573 }
574 else if(data_type == DataType::F32)
575 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100576 if(!is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000577 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100578 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000579 }
580 else
581 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100582 if(_is_square)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000583 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100584 switch(pool_size.x())
585 {
586 case 2:
587 {
588 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
589 break;
590 }
591 case 3:
592 {
593 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
594 break;
595 }
596 case 7:
597 {
598 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
599 break;
600 }
601 default:
602 {
603 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
604 break;
605 }
606 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000607 }
608 else
609 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100610 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000611 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000612 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100613 }
614
Michalis Spyroucffb2a32020-09-08 16:26:38 +0100615 if(!is_nchw)
616 {
617 // Configure kernel window
618 Window win = calculate_max_window(*output->info(), Steps());
619 Coordinates coord;
620 coord.set_num_dimensions(output->info()->num_dimensions());
621 output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
622 INEKernel::configure(win);
623 }
624 else
625 {
626 // Configure kernel window
627 auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
628 pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
629 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
630 INEKernel::configure(win_config.second);
631 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100632}
633
Sheri Zhang996c7772020-08-10 12:02:59 +0100634template <typename T>
Sheri Zhange0681992020-07-14 15:29:28 +0100635inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y)
636{
637 const int pad_left = info.padding().left;
638 const int pad_right = info.padding().right;
639 const int pad_top = info.padding().top;
640 const int pad_bottom = info.padding().bottom;
641 const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
642 const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
643 const int pad_horiz = pad_left + pad_right;
644 const int pad_vert = pad_top + pad_bottom;
645
646 if(info.data_layout() == DataLayout::NCHW)
647 {
648 const uint32_t offset_base = padded_offset
649 - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
650 - pad_top * sizeof(T) /* top padding */
651 - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
652 - in_stride_w * id[3];
653
654 return offset_base;
655 }
656 else
657 {
658 const uint32_t offset_base = padded_offset
659 - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
660 - pad_top * sizeof(T) // top padding
661 - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
662 - in_stride_w * id[3];
663
664 return offset_base;
665 }
666}
667
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000668template <typename T>
669void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000670{
671 Iterator input(_input, window_input);
672 Iterator output(_output, window);
673
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000674 /** NEON vector types */
675 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
676 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
677 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
678 using q16_t = typename wrapper::traits::promote_t<T>;
679 using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
680 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
681 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
682
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000683 constexpr int pool_size = 2;
684 int pool_stride_x = 0;
685 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000686 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
687 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
688 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
689 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
690 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000691 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
692 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000693
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000694 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
695 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
Georgios Pinitas55186712018-01-08 17:37:12 +0000696
697 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
698
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100699 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
700 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
701 const bool have_different_qinfo = input_qinfo != output_qinfo;
702
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000703 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
704 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
705 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
706
Georgios Pinitas55186712018-01-08 17:37:12 +0000707 execute_window_loop(window, [&](const Coordinates & id)
708 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000709 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
710 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
711 q8x8_t lower_res = {};
712 q8x8_t upper_res = {};
Georgios Pinitas55186712018-01-08 17:37:12 +0000713
714 if(pooling_type != PoolingType::MAX)
715 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000716 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
717 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +0000718
719 // Add rows
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000720 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +0000721 {
722 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000723 wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
724 wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +0000725 }
726 };
727
728 // Pair-wise add row data
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000729 const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
730 const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +0000731
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000732 q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
Georgios Pinitas55186712018-01-08 17:37:12 +0000733
734 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000735 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
736 pool_size, upper_bound_w, upper_bound_h,
737 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
738 lower_res = wrapper::vmovn(res_lower);
Georgios Pinitas55186712018-01-08 17:37:12 +0000739
740 // Compute upper result for stride_x == 1
741 if(pool_stride_x == 1)
742 {
743 // Shifted row sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000744 const q16x8x2_t vrsum_shifted =
Georgios Pinitas55186712018-01-08 17:37:12 +0000745 {
746 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000747 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
748 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +0000749 }
750 };
751
752 // Pair-wise add shifted row
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000753 q16x8_t res_upper = wrapper::vcombine(
754 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
755 wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
Georgios Pinitas55186712018-01-08 17:37:12 +0000756
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000757 // Scale upper result
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000758 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
759 pool_size, upper_bound_w, upper_bound_h,
760 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
761 upper_res = wrapper::vmovn(res_upper);
Georgios Pinitas55186712018-01-08 17:37:12 +0000762 }
763 }
764 else
765 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000766 const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
767 lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
Georgios Pinitas55186712018-01-08 17:37:12 +0000768 if(pool_stride_x == 1)
769 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000770 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
771 upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
Georgios Pinitas55186712018-01-08 17:37:12 +0000772 }
773 }
774
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100775 if(have_different_qinfo)
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100776 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +0000777 const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000778 lower_res = wrapper::vgetlow(requantized_output);
779 upper_res = wrapper::vgethigh(requantized_output);
Pablo Telloa52e4cf2019-04-01 14:55:18 +0100780 }
781
Georgios Pinitas55186712018-01-08 17:37:12 +0000782 // Store result
783 if(pool_stride_x == 1)
784 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000785 const q8x8x2_t res = { { lower_res, upper_res } };
786 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000787 }
788 else
789 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000790 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
Georgios Pinitas55186712018-01-08 17:37:12 +0000791 }
792 },
793 input, output);
794}
795
Pablo Tello77e6c552018-12-04 15:33:49 +0000796void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100797{
Pablo Tello77e6c552018-12-04 15:33:49 +0000798 ARM_COMPUTE_UNUSED(pooling_type);
799 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000800#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100801 Iterator input(_input, window_input);
802 Iterator output(_output, window);
803
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000804 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000805 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
806 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
807 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
808 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000809 int pool_stride_x = 0;
810 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +0000811 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000812 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
813 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100814
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000815 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
816 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
817 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100818
819 execute_window_loop(window, [&](const Coordinates & id)
820 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100821 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
822 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
823 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
824 float16x4_t res = {};
825
826 // Get power of 2 in case of l2 pooling
827 if(pooling_type == PoolingType::L2)
828 {
829 top_data = vmul_f16(top_data, top_data);
830 middle_data = vmul_f16(middle_data, middle_data);
831 bottom_data = vmul_f16(bottom_data, bottom_data);
832 }
833
834 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100835 {
836 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000837 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100838 const float16x4_t scale_v = vdup_n_f16(scale);
839 // Perform pooling
840 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
841 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
842 res = vmul_f16(vpadd_f16(res, res), scale_v);
843 }
844 else
845 {
846 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
847 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
848 res = vpmax_f16(res, res);
849 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100850
851 // Calculate square-root in case of l2 pooling
852 if(pooling_type == PoolingType::L2)
853 {
854 res = vinv_f16(vinvsqrt_f16(res));
855 }
856
Pablo Tello0c34fe22017-06-26 17:17:42 +0100857 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
858 },
859 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000860#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100861 ARM_COMPUTE_UNUSED(window_input);
862 ARM_COMPUTE_UNUSED(window);
863 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000864#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100865}
866
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Widen the first two lanes of a half-precision vector to single precision.
 *
 * Overload enabled only when @p T is float16_t. Lanes 2 and 3 of @p input are ignored.
 *
 * @param[in] input Vector of four FP16 values.
 *
 * @return Two-lane FP32 vector holding lanes 0 and 1 of @p input.
 */
template <typename T>
inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
f16_to_f32(float16x4_t input)
{
    const float       lane0   = static_cast<float>(vget_lane_f16(input, 0));
    const float       lane1   = static_cast<float>(vget_lane_f16(input, 1));
    const float32x2_t widened = { lane0, lane1 };
    return widened;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
876
877template <typename T>
878inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
879f16_to_f32(float32x2_t input)
880{
881 return input;
882}
883
Sheri Zhang996c7772020-08-10 12:02:59 +0100884template <typename T>
Sheri Zhange0681992020-07-14 15:29:28 +0100885void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window)
886{
887 Iterator input(_input, window_input);
888 Iterator output(_output, window);
889 Iterator indices(_indices, window);
890 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
891 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
892 int pool_stride_x = 0;
893 int pool_stride_y = 0;
894 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
895 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
896 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
897 const int pad_left = _input->info()->padding().left;
898 const int pad_right = _input->info()->padding().right;
899 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
900
901 execute_window_loop(window, [&](const Coordinates & id)
902 {
903 auto top_data = wrapper::vload(reinterpret_cast<const T *>(input_top_ptr + input.offset()));
904 auto bottom_data = wrapper::vload(reinterpret_cast<const T *>(input_bottom_ptr + input.offset()));
905 float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
906 float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
907
908 // Calculate max data, compare top first, then bottom, to make sue the first max is recorded.
909 const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
910 const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
911 const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
912 *(reinterpret_cast<T *>(output.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
913
914 // Calculate max data indice, which will be used in max unpool.
915 const uint32_t offset_base = offset_no_padding<T>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
916 const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
917 const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
918 const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
919 const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
920 const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
921 const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
922 *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
923 },
924 input, output, indices);
925}
926
Pablo Tello77e6c552018-12-04 15:33:49 +0000927void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100928{
Pablo Tello77e6c552018-12-04 15:33:49 +0000929 ARM_COMPUTE_UNUSED(pooling_type);
930 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000931#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhange0681992020-07-14 15:29:28 +0100932 if(pooling_type == PoolingType::MAX && _indices)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100933 {
Sheri Zhange0681992020-07-14 15:29:28 +0100934 pooling2_nchw_maxpool_indices<float16_t>(window_input, window);
935 }
936 else
937 {
938 Iterator input(_input, window_input);
939 Iterator output(_output, window);
940 constexpr int pool_size = 2;
941 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
942 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
943 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
944 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
945 int pool_stride_x, pool_stride_y = 0;
946 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
947 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
948 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100949
Sheri Zhange0681992020-07-14 15:29:28 +0100950 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
951 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
952
953 execute_window_loop(window, [&](const Coordinates & id)
Georgios Pinitascdf51452017-08-31 14:21:36 +0100954 {
Sheri Zhange0681992020-07-14 15:29:28 +0100955 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
956 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
957 float16x4_t res = {};
Georgios Pinitascdf51452017-08-31 14:21:36 +0100958
Sheri Zhange0681992020-07-14 15:29:28 +0100959 // Get power of 2 in case of l2 pooling
960 if(pooling_type == PoolingType::L2)
961 {
962 top_data = vmul_f16(top_data, top_data);
963 bottom_data = vmul_f16(bottom_data, bottom_data);
964 }
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100965
Sheri Zhange0681992020-07-14 15:29:28 +0100966 if(pooling_type != PoolingType::MAX)
967 {
968 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
969 const float16x4_t scale_v = vdup_n_f16(scale);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100970
Sheri Zhange0681992020-07-14 15:29:28 +0100971 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
972 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
973 }
974 else
975 {
976 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
977 res = vpmax_f16(max_data, max_data);
978 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100979
Sheri Zhange0681992020-07-14 15:29:28 +0100980 // Calculate square-root in case of l2 pooling
981 if(pooling_type == PoolingType::L2)
982 {
983 res = vinv_f16(vinvsqrt_f16(res));
984 }
985
986 // Store result
987 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
988 },
989 input, output);
990 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000991#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100992 ARM_COMPUTE_UNUSED(window_input);
993 ARM_COMPUTE_UNUSED(window);
994 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000995#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100996}
997
Manuel Bottinib4bb8272019-12-18 18:01:27 +0000998template <typename T>
999void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00001000{
1001 Iterator input(_input, window_input);
1002 Iterator output(_output, window);
1003
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001004 /** NEON vector types */
1005 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
1006 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
1007 using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
1008 using q16_t = typename wrapper::traits::promote_t<T>;
1009 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1010 using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
1011
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001012 constexpr int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001013 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1014 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1015 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1016 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001017 int pool_stride_x = 0;
1018 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001019 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001020 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1021 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001022
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01001023 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
1024 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001025
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001026 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
1027 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1028 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
1029
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001030 const T *const input_top_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
1031 const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
1032 const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
Georgios Pinitas55186712018-01-08 17:37:12 +00001033
1034 execute_window_loop(window, [&](const Coordinates & id)
1035 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001036 const auto top_data = wrapper::vloadq(input_top_ptr + input.offset());
1037 const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
1038 const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
1039 q8x8_t fres = {};
1040 q8x16_t fqres = {};
Georgios Pinitas55186712018-01-08 17:37:12 +00001041
1042 if(pooling_type == PoolingType::AVG)
1043 {
1044 // Convert data to u16
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001045 const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
1046 const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
1047 const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
Georgios Pinitas55186712018-01-08 17:37:12 +00001048
1049 // Calculate row sums
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001050 const q16x8x2_t vrsum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001051 {
1052 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001053 wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
1054 wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001055 }
1056 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001057 const q16x8x2_t vrsum_shifted_1 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001058 {
1059 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001060 wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
1061 wrapper::vext_1(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001062 }
1063 };
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001064 const q16x8x2_t vrsum_shifted_2 =
Georgios Pinitas55186712018-01-08 17:37:12 +00001065 {
1066 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001067 wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
1068 wrapper::vext_2(vrsum.val[1], vrsum.val[1])
Georgios Pinitas55186712018-01-08 17:37:12 +00001069 }
1070 };
1071 // Calculate final sum
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001072 q16x8x2_t final_sum =
Georgios Pinitas55186712018-01-08 17:37:12 +00001073 {
1074 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001075 wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1076 wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
Georgios Pinitas55186712018-01-08 17:37:12 +00001077 }
1078 };
1079 if(pool_stride_x == 2)
1080 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001081 q16x8_t res =
Georgios Pinitas55186712018-01-08 17:37:12 +00001082 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001083 wrapper::vgetlane(final_sum.val[0], 0),
1084 wrapper::vgetlane(final_sum.val[0], 2),
1085 wrapper::vgetlane(final_sum.val[0], 4),
1086 wrapper::vgetlane(final_sum.val[0], 6),
1087 wrapper::vgetlane(final_sum.val[1], 0),
1088 wrapper::vgetlane(final_sum.val[1], 2),
1089 wrapper::vgetlane(final_sum.val[1], 4),
1090 wrapper::vgetlane(final_sum.val[1], 6),
Georgios Pinitas55186712018-01-08 17:37:12 +00001091 };
1092
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001093 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
1094 pool_size, upper_bound_w, upper_bound_h,
1095 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1096 fres = wrapper::vmovn(res);
Georgios Pinitas55186712018-01-08 17:37:12 +00001097 }
1098 else
1099 {
1100 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001101 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1102 pool_size, upper_bound_w, upper_bound_h,
1103 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001104 // Scale lower result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001105 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1106 pool_size, upper_bound_w, upper_bound_h,
1107 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1108 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
Georgios Pinitas55186712018-01-08 17:37:12 +00001109 }
1110 }
1111 else
1112 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001113 const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1114 const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1115 const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1116 const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
Georgios Pinitas55186712018-01-08 17:37:12 +00001117
1118 if(pool_stride_x == 2)
1119 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001120 const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1121 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1122 fres = wrapper::vtbl(table, lookup_val);
Georgios Pinitas55186712018-01-08 17:37:12 +00001123 }
1124 else
1125 {
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001126 fqres = final_max;
Georgios Pinitas55186712018-01-08 17:37:12 +00001127 }
1128 }
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001129
1130 // Store result
1131 if(pool_stride_x == 1)
1132 {
1133 if(input_qinfo != output_qinfo)
1134 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001135 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001136 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001137 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001138 }
1139 else
1140 {
1141 if(input_qinfo != output_qinfo)
1142 {
Manuel Bottinicf4737a2020-02-06 11:58:51 +00001143 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001144 }
Manuel Bottinib4bb8272019-12-18 18:01:27 +00001145 wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
Georgios Pinitasd66094e2019-04-15 15:44:17 +01001146 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001147 },
1148 input, output);
1149}
1150
Pablo Tello77e6c552018-12-04 15:33:49 +00001151void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001152{
Pablo Tello77e6c552018-12-04 15:33:49 +00001153 ARM_COMPUTE_UNUSED(pooling_type);
1154 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001155#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1156 Iterator input(_input, window_input);
1157 Iterator output(_output, window);
1158
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001159 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1160 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1161 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1162 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1163 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1164 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001165 int pool_stride_x = 0;
1166 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001167 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001168 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1169 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1170
1171 execute_window_loop(window, [&](const Coordinates & id)
1172 {
1173 float16_t res = 0.0f;
1174 float16x8_t vres = vdupq_n_f16(0.0f);
1175
1176 if(pooling_type != PoolingType::MAX)
1177 {
1178 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001179 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001180
1181 // Perform pooling
1182
1183 for(int y = 0; y < pool_size_y; ++y)
1184 {
1185 int x = 0;
1186 for(; x <= (pool_size_x - 8); x += 8)
1187 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001188 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1189 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001190
1191 // Get power of 2 in case of l2 pooling and accumulate
1192 if(pooling_type == PoolingType::L2)
1193 {
1194 vres = vaddq_f16(vres, vmulq_f16(data, data));
1195 }
1196 else
1197 {
1198 vres = vaddq_f16(vres, data);
1199 }
1200 }
1201
1202 // Leftover for loop
1203 for(; x < pool_size_x; ++x)
1204 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001205 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1206 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001207
1208 // Get power of 2 in case of l2 pooling
1209 if(pooling_type == PoolingType::L2)
1210 {
1211 data *= data;
1212 }
1213
1214 res += data;
1215 }
1216 }
1217
1218 // Reduction
1219 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1220 res += vget_lane_f16(tmp, 0);
1221 res += vget_lane_f16(tmp, 1);
1222 res += vget_lane_f16(tmp, 2);
1223 res += vget_lane_f16(tmp, 3);
1224
1225 // Divide by scale
1226 res *= scale;
1227 }
1228 else
1229 {
1230 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1231 res = std::numeric_limits<float>::lowest();
1232
1233 for(int y = 0; y < pool_size_y; ++y)
1234 {
1235 int x = 0;
1236 for(; x <= (pool_size_x - 8); x += 8)
1237 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001238 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1239 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001240 vres = vmaxq_f16(vres, data);
1241 }
1242
1243 // Leftover for loop
1244 for(; x < pool_size_x; ++x)
1245 {
Georgios Pinitas0922dbb2019-11-25 18:39:27 +00001246 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1247 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1248 res = std::max(res, data);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001249 }
1250 }
1251
1252 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1253 res = std::max(res, vget_lane_f16(tmp, 0));
1254 res = std::max(res, vget_lane_f16(tmp, 1));
1255 res = std::max(res, vget_lane_f16(tmp, 2));
1256 res = std::max(res, vget_lane_f16(tmp, 3));
1257 }
1258
1259 // Calculate square-root in case of l2 pooling
1260 if(pooling_type == PoolingType::L2)
1261 {
1262 res = std::sqrt(res);
1263 }
1264
1265 // Store result
1266 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1267 },
1268 input, output);
1269
1270#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1271 ARM_COMPUTE_UNUSED(window_input);
1272 ARM_COMPUTE_UNUSED(window);
1273 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1274#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1275}
1276
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** 2x2 max pooling of an FP16 tensor that also records which input element won.
 *
 * The four candidates of each output position are addressed through the Y and Z byte
 * strides of the input; the innermost (X) dimension is walked by hand, eight values at a
 * time with NEON and one at a time for the tail. For every processed value the maximum is
 * stored to @p _output and a 32-bit element index is stored to @p _indices. The index is
 * derived from offset_no_padding() divided by sizeof(float16_t), plus the X offset.
 *
 * The winner selection is carried out in 32-bit lanes. Narrowing the offsets to 16 bits
 * before selecting would truncate every index >= 65536 in the vector loop while the
 * scalar tail keeps the full 32-bit value.
 *
 * @param[in] window_input Window used to iterate over the input tensor.
 * @param[in] window       Window of the output; its X range is the range walked by hand.
 */
void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window)
{
    const int window_start_x = window.x().start();
    const int window_end_x   = window.x().end();
    const int window_step_x  = 8;

    // X is iterated inside the loop body, so it is collapsed to a single step in the iteration window
    Window window_out = window;
    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, window_input);
    Iterator output(_output, window_out);
    Iterator indices(_indices, window_out);

    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();

    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();

    const int pad_right   = _input->info()->padding().right;
    const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
    const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());

    // Comparison results are 16-bit lanes of all ones / all zeros; sign extension turns each half
    // into an equivalent 32-bit lane mask usable with vbslq_u32
    const auto widen_mask_low = [](uint16x8_t mask)
    {
        return vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(mask))));
    };
    const auto widen_mask_high = [](uint16x8_t mask)
    {
        return vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(mask))));
    };

    // Per-lane increments for the first and second group of four consecutive elements
    const uint32x4_t lane_ids_low  = { 0, 1, 2, 3 };
    const uint32x4_t lane_ids_high = { 4, 5, 6, 7 };

    execute_window_loop(window_out, [&](const Coordinates & id)
    {
        const int idx_width    = id.y() * pool_stride_x;
        const int idx_height   = id.z() * pool_stride_y;
        const int pool_limit_y = pool_pad_top - idx_height;
        const int pool_limit_x = pool_pad_left - idx_width;

        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);

        // Byte offsets of the four candidates: x0/x1 are neighbours along Y, x2/x3 are the same pair one step further along Z
        const int in_x0_offset = (pool_start_x - pool_pad_left) * in_stride_y + (pool_start_y - pool_pad_top) * in_stride_z;
        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * in_stride_y + (pool_start_y - pool_pad_top) * in_stride_z;
        const int in_x2_offset = (pool_start_x - pool_pad_left) * in_stride_y + (pool_start_y + 1 - pool_pad_top) * in_stride_z;
        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * in_stride_y + (pool_start_y + 1 - pool_pad_top) * in_stride_z;

        int x_off = window_start_x;
        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
        {
            const auto in_x0_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off;
            const auto in_x1_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off;
            const auto in_x2_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off;
            const auto in_x3_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off;
            const auto v_x0      = vld1q_f16(in_x0_ptr);
            const auto v_x1      = vld1q_f16(in_x1_ptr);
            const auto v_x2      = vld1q_f16(in_x2_ptr);
            const auto v_x3      = vld1q_f16(in_x3_ptr);

            const float16x8_t vmax_01 = vmaxq_f16(v_x0, v_x1);
            const float16x8_t vmax_23 = vmaxq_f16(v_x2, v_x3);
            const float16x8_t vres    = vmaxq_f16(vmax_23, vmax_01);

            // Store result
            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);

            // Element indices of the four candidates for the first lane.
            // NOTE(review): the pad_right terms presumably remove the input's right padding from the row/plane step - confirm against offset_no_padding()
            const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
            const uint32_t offset_x0   = offset_base / sizeof(float16_t) + x_off;
            const uint32_t offset_x1   = offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t offset_x2   = offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
            const uint32_t offset_x3   = offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;

            // Consecutive indices for the eight lanes, kept as two 32-bit vectors per candidate
            const uint32x4_t voffset_x0_0 = vaddq_u32(vdupq_n_u32(offset_x0), lane_ids_low);
            const uint32x4_t voffset_x0_1 = vaddq_u32(vdupq_n_u32(offset_x0), lane_ids_high);
            const uint32x4_t voffset_x1_0 = vaddq_u32(vdupq_n_u32(offset_x1), lane_ids_low);
            const uint32x4_t voffset_x1_1 = vaddq_u32(vdupq_n_u32(offset_x1), lane_ids_high);
            const uint32x4_t voffset_x2_0 = vaddq_u32(vdupq_n_u32(offset_x2), lane_ids_low);
            const uint32x4_t voffset_x2_1 = vaddq_u32(vdupq_n_u32(offset_x2), lane_ids_high);
            const uint32x4_t voffset_x3_0 = vaddq_u32(vdupq_n_u32(offset_x3), lane_ids_low);
            const uint32x4_t voffset_x3_1 = vaddq_u32(vdupq_n_u32(offset_x3), lane_ids_high);

            // Same tie-breaking as the scalar tail: ">=" prefers x0 over x1, x2 over x3 and the first pair over the second
            const uint16x8_t ge_01    = vcgeq_f16(v_x0, v_x1);
            const uint16x8_t ge_23    = vcgeq_f16(v_x2, v_x3);
            const uint16x8_t ge_pairs = vcgeq_f16(vmax_01, vmax_23);

            const uint32x4_t idx_01_0 = vbslq_u32(widen_mask_low(ge_01), voffset_x0_0, voffset_x1_0);
            const uint32x4_t idx_01_1 = vbslq_u32(widen_mask_high(ge_01), voffset_x0_1, voffset_x1_1);
            const uint32x4_t idx_23_0 = vbslq_u32(widen_mask_low(ge_23), voffset_x2_0, voffset_x3_0);
            const uint32x4_t idx_23_1 = vbslq_u32(widen_mask_high(ge_23), voffset_x2_1, voffset_x3_1);
            const uint32x4_t idx_0    = vbslq_u32(widen_mask_low(ge_pairs), idx_01_0, idx_23_0);
            const uint32x4_t idx_1    = vbslq_u32(widen_mask_high(ge_pairs), idx_01_1, idx_23_1);

            // Store indices: lanes 0-3 first, lanes 4-7 four uint32_t entries further on
            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, idx_0);
            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off + 4, idx_1);
        }

        // Left-overs loop
        for(; x_off < window_end_x; ++x_off)
        {
            const auto x0  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off);
            const auto x1  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off);
            const auto x2  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off);
            const auto x3  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off);
            float16_t  res = std::max(std::max(x2, x3), std::max(x0, x1));

            // Store result
            *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;

            const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
            const uint32_t offset_x0   = offset_base / sizeof(float16_t) + x_off;
            const uint32_t offset_x1   = offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t offset_x2   = offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
            const uint32_t offset_x3   = offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t tmp_idx0    = (x0 >= x1) ? offset_x0 : offset_x1;
            const uint32_t tmp_idx1    = (x2 >= x3) ? offset_x2 : offset_x3;
            const uint32_t tmp_idx2    = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;

            // Store indices
            *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
        }
    },
    input, output, indices);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1390
Pablo Tello77e6c552018-12-04 15:33:49 +00001391void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001392{
Pablo Tello77e6c552018-12-04 15:33:49 +00001393 ARM_COMPUTE_UNUSED(pooling_type);
1394 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001395#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhange0681992020-07-14 15:29:28 +01001396 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1397 {
1398 pooling2_f16_nhwc_maxpool_indices(window_input, window);
1399 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001400 const int window_start_x = window.x().start();
1401 const int window_end_x = window.x().end();
1402 const int window_step_x = 8;
1403
1404 Window window_out = window;
1405 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1406
Michalis Spyrou57dac842018-03-01 16:03:50 +00001407 Iterator input(_input, window_input);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001408 Iterator output(_output, window_out);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001409
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001410 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1411 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1412 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1413 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1414 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1415 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001416 int pool_stride_x = 0;
1417 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001418 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00001419 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1420 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1421
1422 float16x8_t vres;
1423
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001424 execute_window_loop(window_out, [&](const Coordinates & id)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001425 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001426 const int idx_width = id.y() * pool_stride_x;
1427 const int idx_height = id.z() * pool_stride_y;
1428 const int pool_limit_y = pool_pad_top - idx_height;
1429 const int pool_limit_x = pool_pad_left - idx_width;
1430
1431 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1432 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1433 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1434 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1435
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001436 int x_off = window_start_x;
Michalis Spyrouc2268532020-10-09 11:52:10 +01001437 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001438 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001439 if(pooling_type != PoolingType::MAX)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001440 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001441 // Calculate scale
1442 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1443 pool_stride_y);
1444 const float16x8_t scale_v = vdupq_n_f16(scale);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001445
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001446 // Perform pooling
1447 vres = vdupq_n_f16(0.0f);
1448 for(int y = pool_start_y; y < pool_end_y; ++y)
1449 {
1450 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001451 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001452 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1453 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
1454
1455 // Get power of 2 in case of l2 pooling and accumulate
1456 if(pooling_type == PoolingType::L2)
1457 {
1458 vres = vaddq_f16(vres, vmulq_f16(data, data));
1459 }
1460 else
1461 {
1462 vres = vaddq_f16(vres, data);
1463 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001464 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001465 }
1466 // Divide by scale
1467 vres = vmulq_f16(vres, scale_v);
1468 }
1469 else
1470 {
1471 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1472
1473 for(int y = pool_start_y; y < pool_end_y; ++y)
1474 {
1475 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001476 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001477 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1478 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
1479 vres = vmaxq_f16(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001480 }
1481 }
1482 }
Michalis Spyrouced25572018-10-01 16:26:20 +01001483
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001484 // Calculate square-root in case of l2 pooling
1485 if(pooling_type == PoolingType::L2)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001486 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001487 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1488 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1489 }
1490
1491 // Store result
1492 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
1493 }
1494
1495 // Left-overs loop
1496 for(; x_off < window_end_x; ++x_off)
1497 {
1498 float16_t res = 0.0f;
1499
1500 if(pooling_type != PoolingType::MAX)
1501 {
1502 // Calculate scale
1503 const float16_t scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1504 pool_stride_y);
1505
1506 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001507 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001508 for(int x = pool_start_x; x < pool_end_x; ++x)
1509 {
1510 const float data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1511 (_input->info()->strides_in_bytes().z())) + x_off);
1512
1513 // Get power of 2 in case of l2 pooling and accumulate
1514 if(pooling_type == PoolingType::L2)
1515 {
1516 res += data * data;
1517 }
1518 else
1519 {
1520 res += data;
1521 }
1522 }
1523 }
1524
1525 // Divide by scale
1526 res *= scale;
1527 }
1528 else
1529 {
1530 res = std::numeric_limits<float>::lowest();
1531 for(int y = pool_start_y; y < pool_end_y; ++y)
1532 {
1533 for(int x = pool_start_x; x < pool_end_x; ++x)
1534 {
1535 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1536 (_input->info()->strides_in_bytes().z())) + x_off);
1537 res = std::max(res, data);
1538 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001539 }
1540 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001541
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001542 // Calculate square-root in case of l2 pooling
1543 if(pooling_type == PoolingType::L2)
1544 {
1545 res = std::sqrt(res);
1546 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001547
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001548 // Store result
1549 *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
1550 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001551 },
1552 input, output);
1553
1554#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1555 ARM_COMPUTE_UNUSED(window_input);
1556 ARM_COMPUTE_UNUSED(window);
1557 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1558#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1559}
1560
Pablo Tello77e6c552018-12-04 15:33:49 +00001561void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001562{
1563 Iterator input(_input, window_input);
1564 Iterator output(_output, window);
1565
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001566 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1567 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1568 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1569 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1570 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1571 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001572 int pool_stride_x = 0;
1573 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001574 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001575 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1576 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001577
1578 execute_window_loop(window, [&](const Coordinates & id)
1579 {
1580 float res = 0.0f;
1581
1582 if(pooling_type != PoolingType::MAX)
1583 {
1584 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001585 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001586
1587 // Perform pooling
1588 float32x4_t vres = vdupq_n_f32(0.0f);
1589
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001590 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001591 {
1592 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001593 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001594 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001595 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1596 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001597
1598 // Get power of 2 in case of l2 pooling and accumulate
1599 if(pooling_type == PoolingType::L2)
1600 {
1601 vres = vmlaq_f32(vres, data, data);
1602 }
1603 else
1604 {
1605 vres = vaddq_f32(vres, data);
1606 }
1607 }
1608
1609 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001610 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001611 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001612 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1613 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001614
1615 // Get power of 2 in case of l2 pooling
1616 if(pooling_type == PoolingType::L2)
1617 {
1618 data *= data;
1619 }
1620
1621 res += data;
1622 }
1623 }
1624
1625#if defined(__aarch64__)
1626 // Reduction operation available on 64 bit architectures only
1627 res += vaddvq_f32(vres);
1628#else // __aarch64__
1629 // Reduction
1630 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1631 tmp = vpadd_f32(tmp, tmp);
1632
1633 res += vget_lane_f32(tmp, 0);
1634#endif // __aarch64__
1635 // Divide by scale
1636 res *= scale;
1637 }
1638 else
1639 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001640 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1641 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001642
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001643 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001644 {
1645 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001646 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001647 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001648 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1649 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001650 vres = vmaxq_f32(vres, data);
1651 }
1652
1653 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001654 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001655 {
Michalis Spyrou7c60c992019-10-10 14:33:47 +01001656 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1657 (_input->info()->strides_in_bytes().y())));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001658 res = std::max(res, data);
1659 }
1660 }
Gian Marco Iodice16824302017-09-28 15:41:37 +01001661#if defined(__aarch64__)
1662 // Reduction operation available on 64 bit architectures only
1663 res = std::max(vmaxvq_f32(vres), res);
1664#else // __aarch64__
1665 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1666 tmp = vpmax_f32(tmp, tmp);
1667
1668 res = std::max(res, vget_lane_f32(tmp, 0));
1669#endif // __aarch64__
1670 }
1671
1672 // Calculate square-root in case of l2 pooling
1673 if(pooling_type == PoolingType::L2)
1674 {
1675 res = std::sqrt(res);
1676 }
1677
1678 // Store result
1679 *(reinterpret_cast<float *>(output.ptr())) = res;
1680 },
1681 input, output);
1682}
1683
morgolockcc1f6c92020-03-24 09:26:48 +00001684void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
1685 bool exclude_padding)
1686{
1687 if(pooling_type == PoolingType::MAX && _indices)
1688 {
Sheri Zhange0681992020-07-14 15:29:28 +01001689 pooling2_nchw_maxpool_indices<float>(window_input, window);
morgolockcc1f6c92020-03-24 09:26:48 +00001690 }
1691 else
1692 {
1693 Iterator input(_input, window_input);
1694 Iterator output(_output, window);
1695 constexpr int pool_size = 2;
1696 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1697 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1698 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1699 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1700 int pool_stride_x = 0;
1701 int pool_stride_y = 0;
1702 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1703 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1704 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1705
1706 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1707 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1708
1709 execute_window_loop(window, [&](const Coordinates & id)
1710 {
1711 const auto in_top_ptr = reinterpret_cast<const float *>(input_top_ptr + input.offset());
1712 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
1713 float32x2_t top_data = vld1_f32(in_top_ptr);
1714 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1715 float32x2_t res = {};
1716 float final_res = 0;
1717 // Get power of 2 in case of l2 pooling
1718 if(pooling_type == PoolingType::L2)
1719 {
1720 top_data = vmul_f32(top_data, top_data);
1721 bottom_data = vmul_f32(bottom_data, bottom_data);
1722 }
1723
1724 if(pooling_type != PoolingType::MAX)
1725 {
1726 // Calculate scale
1727 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1728 const float32x2_t scale_v = vdup_n_f32(scale);
1729
1730 // Perform pooling
1731 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1732 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1733 }
1734 else
1735 {
1736 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1737 res = vpmax_f32(max_data, max_data);
1738 }
1739 final_res = vget_lane_f32(res, 0);
1740
1741 // Calculate square-root in case of l2 pooling
1742 if(pooling_type == PoolingType::L2)
1743 {
1744 final_res = sqrt(final_res);
1745 }
1746
1747 // Store result
1748 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1749 },
1750 input, output);
1751 }
Pablo Tello77e6c552018-12-04 15:33:49 +00001752}
1753
1754void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1755{
1756 Iterator input(_input, window_input);
1757 Iterator output(_output, window);
1758
1759 constexpr const int pool_size = 3;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001760 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1761 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1762 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1763 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001764 int pool_stride_x = 0;
1765 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001766 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001767 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1768 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1769
1770 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1771 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1772 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1773
1774 execute_window_loop(window, [&](const Coordinates & id)
1775 {
1776 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1777 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1778 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1779 float32x2_t res = {};
1780 float final_res = 0;
1781
1782 // Get power of 2 in case of l2 pooling
1783 if(pooling_type == PoolingType::L2)
1784 {
1785 top_data = vmulq_f32(top_data, top_data);
1786 middle_data = vmulq_f32(middle_data, middle_data);
1787 bottom_data = vmulq_f32(bottom_data, bottom_data);
1788 }
1789
1790 if(pooling_type != PoolingType::MAX)
1791 {
1792 // Calculate scale
1793 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1794 const float32x2_t scale_v = vdup_n_f32(scale);
1795
1796 // Perform pooling
1797 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1798 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1799 res = vmul_f32(vpadd_f32(res, res), scale_v);
1800 }
1801 else
1802 {
1803 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1804 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1805 res = vpmax_f32(res, res);
1806 }
1807 final_res = vget_lane_f32(res, 0);
1808
1809 // Calculate square-root in case of l2 pooling
1810 if(pooling_type == PoolingType::L2)
1811 {
1812 final_res = sqrt(final_res);
1813 }
1814
1815 // Store result
1816 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1817 },
1818 input, output);
1819}
1820
1821void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1822{
1823 Iterator input(_input, window_input);
1824 Iterator output(_output, window);
1825
1826 constexpr const int pool_size = 7;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001827 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1828 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1829 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1830 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Pablo Tello77e6c552018-12-04 15:33:49 +00001831 int pool_stride_x = 0;
1832 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00001833 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Pablo Tello77e6c552018-12-04 15:33:49 +00001834 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1835 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1836
1837 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1838 for(int i = 0; i < pool_size; ++i)
1839 {
1840 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1841 }
1842
1843 execute_window_loop(window, [&](const Coordinates & id)
1844 {
1845 float32x2_t res = {};
1846 float final_res = 0.f;
1847 if(pooling_type != PoolingType::MAX)
1848 {
1849 // Calculate scale
1850 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1851 const float32x2_t scale_v = vdup_n_f32(scale);
1852
1853 // Perform pooling
1854 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1855 // Get power of 2 in case of l2 pooling
1856 if(pooling_type == PoolingType::L2)
1857 {
1858 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1859 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1860 }
1861 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1862 for(int i = 1; i < pool_size; ++i)
1863 {
1864 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1865 // Get power of 2 in case of l2 pooling
1866 if(pooling_type == PoolingType::L2)
1867 {
1868 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1869 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1870 }
1871 sum_data = vaddq_f32(sum_data, data.val[0]);
1872 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1873 }
1874 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1875 res = vmul_f32(vpadd_f32(res, res), scale_v);
1876 }
1877 else
1878 {
1879 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1880 for(int i = 1; i < pool_size; ++i)
1881 {
1882 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1883 max_data = vmax2q_f32(max_data, data);
1884 }
1885 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1886 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1887 res = vpmax_f32(res, res);
1888 }
1889 final_res = vget_lane_f32(res, 0);
1890
1891 // Calculate square-root in case of l2 pooling
1892 if(pooling_type == PoolingType::L2)
1893 {
1894 final_res = sqrt(final_res);
1895 }
1896
1897 // Store result
1898 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1899 },
1900 input, output);
1901}
1902
1903void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001904{
morgolocke383c352020-04-03 16:57:46 +01001905 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1906 {
1907 pooling2_f32_nhwc_maxpool_indices(window_input, window);
1908 }
1909 else
1910 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001911 const int window_start_x = window.x().start();
1912 const int window_end_x = window.x().end();
1913 const int window_step_x = 4;
1914
1915 Window window_out = window;
1916 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1917
morgolocke383c352020-04-03 16:57:46 +01001918 Iterator input(_input, window_input);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001919 Iterator output(_output, window_out);
morgolocke383c352020-04-03 16:57:46 +01001920
1921 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1922 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1923 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1924 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1925 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1926 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1927 int pool_stride_x = 0;
1928 int pool_stride_y = 0;
1929 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1930 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1931 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1932
1933 float32x4_t vres;
1934
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001935 execute_window_loop(window_out, [&](const Coordinates & id)
morgolocke383c352020-04-03 16:57:46 +01001936 {
1937 const int idx_width = id.y() * pool_stride_x;
1938 const int idx_height = id.z() * pool_stride_y;
1939 const int pool_limit_y = pool_pad_top - idx_height;
1940 const int pool_limit_x = pool_pad_left - idx_width;
1941
1942 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1943 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1944 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1945 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1946
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001947 int x_off = window_start_x;
Michalis Spyrouc2268532020-10-09 11:52:10 +01001948 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
morgolocke383c352020-04-03 16:57:46 +01001949 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001950 if(pooling_type != PoolingType::MAX)
morgolocke383c352020-04-03 16:57:46 +01001951 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001952 // Calculate scale
1953 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1954 pool_stride_y);
1955 const float32x4_t scale_v = vdupq_n_f32(scale);
morgolocke383c352020-04-03 16:57:46 +01001956
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001957 // Perform pooling
1958 vres = vdupq_n_f32(0.0f);
1959
1960 for(int y = pool_start_y; y < pool_end_y; ++y)
1961 {
1962 for(int x = pool_start_x; x < pool_end_x; ++x)
morgolocke383c352020-04-03 16:57:46 +01001963 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001964 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1965 (_input->info()->strides_in_bytes().z())) + x_off);
1966
1967 // Get power of 2 in case of l2 pooling and accumulate
1968 if(pooling_type == PoolingType::L2)
1969 {
1970 vres = vmlaq_f32(vres, data, data);
1971 }
1972 else
1973 {
1974 vres = vaddq_f32(vres, data);
1975 }
morgolocke383c352020-04-03 16:57:46 +01001976 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001977 }
1978 // Divide by scale
1979 vres = vmulq_f32(vres, scale_v);
1980 }
1981 else
1982 {
1983 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1984 for(int y = pool_start_y; y < pool_end_y; ++y)
1985 {
1986 for(int x = pool_start_x; x < pool_end_x; ++x)
morgolocke383c352020-04-03 16:57:46 +01001987 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001988 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1989 (_input->info()->strides_in_bytes().z())) + x_off);
1990 vres = vmaxq_f32(vres, data);
morgolocke383c352020-04-03 16:57:46 +01001991 }
1992 }
1993 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001994
1995 // Calculate square-root in case of l2 pooling
1996 if(pooling_type == PoolingType::L2)
morgolocke383c352020-04-03 16:57:46 +01001997 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01001998 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1999 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
2000 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
2001 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
2002 };
2003 vres = l2_res;
2004 }
2005
2006 // Store result
2007 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
2008 }
2009
2010 // Left-overs loop
2011 for(; x_off < window_end_x; ++x_off)
2012 {
2013 float res = 0.0f;
2014
2015 if(pooling_type != PoolingType::MAX)
2016 {
2017 // Calculate scale
2018 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2019 pool_stride_y);
2020
2021 for(int y = pool_start_y; y < pool_end_y; ++y)
morgolocke383c352020-04-03 16:57:46 +01002022 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002023 for(int x = pool_start_x; x < pool_end_x; ++x)
2024 {
2025 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2026 (_input->info()->strides_in_bytes().z())) + x_off);
2027
2028 // Get power of 2 in case of l2 pooling and accumulate
2029 if(pooling_type == PoolingType::L2)
2030 {
2031 res += data * data;
2032 }
2033 else
2034 {
2035 res += data;
2036 }
2037 }
2038 }
2039
2040 // Divide by scale
2041 res *= scale;
2042 }
2043 else
2044 {
2045 res = std::numeric_limits<float>::lowest();
2046 for(int y = pool_start_y; y < pool_end_y; ++y)
2047 {
2048 for(int x = pool_start_x; x < pool_end_x; ++x)
2049 {
2050 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2051 (_input->info()->strides_in_bytes().z())) + x_off);
2052 res = std::max(res, data);
2053 }
morgolocke383c352020-04-03 16:57:46 +01002054 }
2055 }
morgolocke383c352020-04-03 16:57:46 +01002056
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002057 // Calculate square-root in case of l2 pooling
2058 if(pooling_type == PoolingType::L2)
2059 {
2060 res = std::sqrt(res);
2061 }
morgolocke383c352020-04-03 16:57:46 +01002062
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002063 // Store result
2064 *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
2065 }
morgolocke383c352020-04-03 16:57:46 +01002066 },
2067 input, output);
2068 }
2069}
2070
2071void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
2072{
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002073 const int window_start_x = window.x().start();
2074 const int window_end_x = window.x().end();
2075 const int window_step_x = 4;
2076
2077 Window window_out = window;
2078 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
2079
Michalis Spyrou57dac842018-03-01 16:03:50 +00002080 Iterator input(_input, window_input);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002081 Iterator output(_output, window_out);
2082 Iterator indices(_indices, window_out);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002083
morgolocke383c352020-04-03 16:57:46 +01002084 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2085 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2086
2087 int pool_stride_x = 0;
2088 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002089 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002090
2091 float32x4_t vres;
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002092 float res;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002093
morgolocke383c352020-04-03 16:57:46 +01002094 const int pad_right = _input->info()->padding().right;
morgolocke383c352020-04-03 16:57:46 +01002095 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
2096 const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
morgolocke383c352020-04-03 16:57:46 +01002097
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002098 execute_window_loop(window_out, [&](const Coordinates & id)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002099 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002100 const int idx_width = id.y() * pool_stride_x;
2101 const int idx_height = id.z() * pool_stride_y;
2102 const int pool_limit_y = pool_pad_top - idx_height;
2103 const int pool_limit_x = pool_pad_left - idx_width;
2104
2105 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
Michalis Spyrouced25572018-10-01 16:26:20 +01002106 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002107
morgolocke383c352020-04-03 16:57:46 +01002108 const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2109 (_input->info()->strides_in_bytes().z());
2110 const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2111 (_input->info()->strides_in_bytes().z());
morgolocke383c352020-04-03 16:57:46 +01002112 const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2113 (_input->info()->strides_in_bytes().z());
morgolocke383c352020-04-03 16:57:46 +01002114 const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2115 (_input->info()->strides_in_bytes().z());
Michalis Spyrou57dac842018-03-01 16:03:50 +00002116
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002117 int x_off = window_start_x;
Michalis Spyrouc2268532020-10-09 11:52:10 +01002118 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002119 {
2120 const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
2121 const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
2122 const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
2123 const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
2124 const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
2125 const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
2126 const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
2127 const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
2128 vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
2129 // Store result
2130 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
morgolocke383c352020-04-03 16:57:46 +01002131
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002132 const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2133 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
2134 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2135 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2136 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
2137 const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
2138 const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
2139 const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
2140 const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
2141 const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
2142 const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
2143 const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
2144
2145 // Store indices
2146 vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
2147 }
2148
2149 // Left-overs loop
2150 for(; x_off < window_end_x; ++x_off)
2151 {
2152 const auto x0 = *(reinterpret_cast<const float *>(input.ptr() + in_x0_offset) + x_off);
2153 const auto x1 = *(reinterpret_cast<const float *>(input.ptr() + in_x1_offset) + x_off);
2154 const auto x2 = *(reinterpret_cast<const float *>(input.ptr() + in_x2_offset) + x_off);
2155 const auto x3 = *(reinterpret_cast<const float *>(input.ptr() + in_x3_offset) + x_off);
2156 res = std::max(std::max(x2, x3), std::max(x0, x1));
2157
2158 // Store result
2159 *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
2160
2161 const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2162 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
2163 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2164 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2165 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
2166 const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
2167 const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
2168 const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
2169
2170 // Store indices
2171 *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
2172 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002173 },
morgolocke383c352020-04-03 16:57:46 +01002174 input, output, indices);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002175}
2176
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002177template <typename T>
2178void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00002179{
2180 Iterator input(_input, window_input);
2181 Iterator output(_output, window);
2182
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002183 /** NEON vector types */
2184 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2185 using q16_t = typename wrapper::traits::promote_t<T>;
2186 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2187 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2188 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2189
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002190 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
2191 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
2192 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2193 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2194 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2195 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00002196 int pool_stride_x = 0;
2197 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002198 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00002199 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
2200 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00002201
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01002202 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
2203 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
2204
Georgios Pinitas55186712018-01-08 17:37:12 +00002205 execute_window_loop(window, [&](const Coordinates & id)
2206 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002207 T res = std::numeric_limits<T>::min();
Georgios Pinitas55186712018-01-08 17:37:12 +00002208
2209 if(pooling_type != PoolingType::MAX)
2210 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002211 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2212 q32_t sres = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00002213
2214 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00002215 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00002216
2217 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002218 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002219 {
2220 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002221 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002222 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002223 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2224 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00002225
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002226 const q16x8_t data_q16 = wrapper::vmovl(data);
2227 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
Georgios Pinitas55186712018-01-08 17:37:12 +00002228 }
2229
2230 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002231 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002232 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002233 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2234 (_input->info()->strides_in_bytes().y())));
Georgios Pinitas55186712018-01-08 17:37:12 +00002235 sres += data;
2236 }
2237 }
2238
2239 // Reduction
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002240 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
2241 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
Georgios Pinitas55186712018-01-08 17:37:12 +00002242
2243 // Divide by scale
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002244 res = static_cast<T>(support::cpp11::round(sres * scale));
Georgios Pinitas55186712018-01-08 17:37:12 +00002245 }
2246 else
2247 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002248 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
Georgios Pinitas55186712018-01-08 17:37:12 +00002249
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002250 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00002251 {
2252 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002253 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00002254 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002255 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2256 (_input->info()->strides_in_bytes().y())));
2257 vres = wrapper::vmax(vres, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002258 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002259 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002260 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00002261 {
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002262 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2263 (_input->info()->strides_in_bytes().y())));
2264 res = std::max(res, data);
Georgios Pinitas55186712018-01-08 17:37:12 +00002265 }
2266 }
2267
2268 // Reduce max
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002269 vres = wrapper::vpmax(vres, vres);
2270 vres = wrapper::vpmax(vres, vres);
2271 vres = wrapper::vpmax(vres, vres);
Georgios Pinitas55186712018-01-08 17:37:12 +00002272
2273 // Get max value
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002274 res = std::max(res, wrapper::vgetlane(vres, 0));
Georgios Pinitas55186712018-01-08 17:37:12 +00002275 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002276 // Store result
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002277 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
2278 *(reinterpret_cast<T *>(output.ptr())) = res;
Georgios Pinitas55186712018-01-08 17:37:12 +00002279 },
2280 input, output);
2281}
2282
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002283template <typename T>
2284void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002285{
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002286 const int window_start_x = window.x().start();
2287 const int window_end_x = window.x().end();
2288 const int window_step_x = 16;
2289
2290 Window window_out = window;
2291 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
2292
Michalis Spyrou57dac842018-03-01 16:03:50 +00002293 Iterator input(_input, window_input);
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002294 Iterator output(_output, window_out);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002295
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002296 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2297 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
2298 using q16_t = typename wrapper::traits::promote_t<T>;
2299 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2300 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2301 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2302
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002303 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
2304 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
2305 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2306 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2307 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2308 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002309
2310 int pool_stride_x = 0;
2311 int pool_stride_y = 0;
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002312 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002313 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
2314 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
2315
Georgios Pinitas4c5469b2019-05-21 13:32:43 +01002316 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
2317 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
2318 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
Georgios Pinitas283fc602018-11-09 10:46:43 +00002319
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002320 const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002321 // "new_offset" doesn't have to consider the "half_scale_v" in its computation
2322 // With a requantization performed in a single step there won't be uncertainties introduced
Michele Di Giorgio82fa5502020-02-19 15:55:01 +00002323 const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002324
2325 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
2326 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
2327 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
2328
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002329 execute_window_loop(window_out, [&](const Coordinates & id)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002330 {
Michalis Spyrouced25572018-10-01 16:26:20 +01002331 const int idx_width = id.y() * pool_stride_x;
2332 const int idx_height = id.z() * pool_stride_y;
2333 const int pool_limit_y = pool_pad_top - idx_height;
2334 const int pool_limit_x = pool_pad_left - idx_width;
2335
2336 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2337 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
2338 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2339 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
2340
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002341 int x_off = window_start_x;
Michalis Spyrouc2268532020-10-09 11:52:10 +01002342 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002343 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002344 if(pooling_type != PoolingType::MAX)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002345 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002346 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2347 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2348 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2349 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002350
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002351 // Calculate scale
2352 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2353 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002354
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002355 // Perform pooling
2356 for(int y = pool_start_y; y < pool_end_y; ++y)
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002357 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002358 for(int x = pool_start_x; x < pool_end_x; ++x)
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002359 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002360 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2361 (_input->info()->strides_in_bytes().z())) + x_off);
2362
2363 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
2364 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
2365 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
2366 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
2367 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
2368 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002369 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002370 }
2371
2372 if(input_qinfo != output_qinfo)
2373 {
2374 const float32x4x4_t vres =
2375 {
2376 {
2377 vcvtq_f32_q32(vres1),
2378 vcvtq_f32_q32(vres2),
2379 vcvtq_f32_q32(vres3),
2380 vcvtq_f32_q32(vres4),
2381 }
2382 };
2383 const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
2384 // Store result
2385 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, wrapper::vgetlow(requantized_output));
2386 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, wrapper::vgethigh(requantized_output));
2387 }
2388 else
2389 {
2390 const float32x4_t scale_v = vdupq_n_f32(scale);
2391 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2392 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
2393 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
2394 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
2395 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
2396
2397 const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
2398 const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
2399 // Store result
2400 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, res1);
2401 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, res2);
2402 }
Pablo Telloa52e4cf2019-04-01 14:55:18 +01002403 }
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002404 else
2405 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002406 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
Michalis Spyrou57dac842018-03-01 16:03:50 +00002407
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002408 for(int y = pool_start_y; y < pool_end_y; ++y)
2409 {
2410 for(int x = pool_start_x; x < pool_end_x; ++x)
2411 {
2412 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2413 (_input->info()->strides_in_bytes().z())) + x_off);
2414 vres = wrapper::vmax(vres, data);
2415 }
2416 }
2417
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002418 // Store result
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002419 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
Georgios Pinitasddb93bb2020-10-02 16:38:59 +01002420 requant_qinfo) :
2421 vres);
Manuel Bottinicf4737a2020-02-06 11:58:51 +00002422 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002423 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002424
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002425 // Left-overs loop
2426 for(; x_off < window_end_x; ++x_off)
2427 {
2428 if(pooling_type != PoolingType::MAX)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002429 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002430 q32_t res = static_cast<q32_t>(0.f);
2431
2432 // Calculate scale
2433 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2434 pool_stride_y);
2435
2436 // Perform pooling
2437 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00002438 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002439 for(int x = pool_start_x; x < pool_end_x; ++x)
2440 {
2441 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2442 (_input->info()->strides_in_bytes().z())) + x_off);
2443 res += data;
2444 }
2445 }
2446
2447 if(input_qinfo != output_qinfo)
2448 {
2449 const float res_f = static_cast<float>(res);
2450 const float new_scale = quant_rescale / scale;
2451 const auto requantized_output = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
2452
2453 // Store result
2454 *(reinterpret_cast<T *>(output.ptr()) + x_off) = requantized_output;
2455 }
2456 else
2457 {
2458 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2459 res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
2460
2461 // Store result
2462 *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002463 }
2464 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002465 else
2466 {
2467 T res = std::numeric_limits<T>::min();
Michalis Spyrou57dac842018-03-01 16:03:50 +00002468
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002469 for(int y = pool_start_y; y < pool_end_y; ++y)
2470 {
2471 for(int x = pool_start_x; x < pool_end_x; ++x)
2472 {
2473 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2474 (_input->info()->strides_in_bytes().z())) + x_off);
2475 res = std::max(res, data);
2476 }
2477 }
2478
2479 // Store result
2480 if(input_qinfo != output_qinfo)
2481 {
2482 const float res_f = static_cast<float>(res);
2483 *(reinterpret_cast<T *>(output.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
2484 }
2485 else
2486 {
2487 *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
2488 }
2489 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002490 }
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002491
Michalis Spyrou57dac842018-03-01 16:03:50 +00002492 },
2493 input, output);
2494}
2495
morgolockcc1f6c92020-03-24 09:26:48 +00002496Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002497{
2498 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2499
2500 unsigned int pooled_w = 0;
2501 unsigned int pooled_h = 0;
2502 unsigned int num_elems_processed_per_iteration = 0;
2503 BorderSize border_size(0);
2504
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002505 const bool is_global_pooling = pool_info.is_global_pooling;
Michalis Spyrou57dac842018-03-01 16:03:50 +00002506 unsigned int pool_size_x = 0;
2507 unsigned int pool_size_y = 0;
2508
2509 // Get data layout
Sang-Hoon Park11fedda2020-01-15 14:44:04 +00002510 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2511 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2512 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrou57dac842018-03-01 16:03:50 +00002513
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002514 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2515 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002516
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002517 // Validate pool info before calling scaled_dimensions
2518 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002519
2520 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00002521 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2522 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002523 pool_size_x,
2524 pool_size_y,
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002525 pool_info.pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002526
morgolockcc1f6c92020-03-24 09:26:48 +00002527 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
2528 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
2529 (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00002530 pool_size_x, pool_size_y)
2531 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00002532
2533 return Status{};
2534}
2535
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002536void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002537{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01002538 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002539 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2540 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2541 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2542
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002543 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
2544 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
2545 const unsigned int pool_size = _pool_info.pool_size.width;
2546 const bool exclude_padding = _pool_info.exclude_padding;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002547
Michalis Spyrou57dac842018-03-01 16:03:50 +00002548 Window window_input(window);
Georgios Pinitas14d9d982019-12-13 12:33:09 +00002549 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002550 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002551 // Set step for input in x and y direction for the input
2552 unsigned int window_x_inc = 0;
2553 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01002554 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00002555 case DataType::QASYMM8:
Manuel Bottinib4bb8272019-12-18 18:01:27 +00002556 case DataType::QASYMM8_SIGNED:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002557 {
2558 window_x_inc = pool_stride_x;
2559 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2560 {
2561 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2562 }
2563 break;
2564 }
Pablo Tello77e6c552018-12-04 15:33:49 +00002565
Georgios Pinitas13d96e02018-08-23 11:20:23 +01002566 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00002567 case DataType::F32:
2568 {
2569 window_x_inc = pool_stride_x;
2570 break;
2571 }
2572 default:
2573 {
2574 ARM_COMPUTE_ERROR("Not supported");
2575 }
Georgios Pinitas55186712018-01-08 17:37:12 +00002576 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002577 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2578 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002579 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00002580 else
2581 {
Michalis Spyroucffb2a32020-09-08 16:26:38 +01002582 window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
Michalis Spyrou57dac842018-03-01 16:03:50 +00002583 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2584 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2585 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002586
2587 // Run function
Sang-Hoon Park0cb3da62020-01-15 12:39:56 +00002588 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002589}
morgolockcc1f6c92020-03-24 09:26:48 +00002590} // namespace arm_compute