blob: 244741c947b586ca3528de95fef7ea12eee25355 [file] [log] [blame]
/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitas55186712018-01-08 17:37:12 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitascdf51452017-08-31 14:21:36 +010033#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Utils.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/core/Window.h"
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010038#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010039
Georgios Pinitas55186712018-01-08 17:37:12 +000040#include "support/ToolchainSupport.h"
41
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042#include <algorithm>
43#include <arm_neon.h>
Georgios Pinitascdf51452017-08-31 14:21:36 +010044#include <cmath>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010045#include <limits>
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +010046#include <set>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010047#include <string>
48#include <tuple>
49
50using namespace arm_compute;
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +010051using namespace misc::shape_calculator;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010052
53namespace
54{
Pablo Tello77e6c552018-12-04 15:33:49 +000055inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Anthony Barbier6ff3b192017-09-04 18:44:23 +010056 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
57{
Michalis Spyrou57dac842018-03-01 16:03:50 +000058 const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
59 const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
60
61 int start_x = id[idx_width] * stride_x - pad_x;
62 int start_y = id[idx_height] * stride_y - pad_y;
63
64 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
65 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +000066 if(exclude_padding)
67 {
68 start_x = std::max(0, start_x);
69 start_y = std::max(0, start_y);
70 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +010071 return 1.f / ((end_y - start_y) * (end_x - start_x));
72}
73
/** Scale each of the eight 16-bit lanes of a vector by the averaging factor of its own pooling window.
 *
 * Lane i is treated as the output element whose window starts at
 * ((id.x() + id_offset) * stride_x - pad_x) + i * step * stride_x along x;
 * all lanes share the same window along y. The product is converted back to
 * uint16_t (fractional part dropped).
 *
 * @param[in]      exclude_padding If true, negative window starts are clamped to 0 before the area is computed.
 * @param[in, out] v               Vector of accumulated sums, replaced by the scaled values.
 * @param[in]      id              Coordinates of the first output element of the iteration.
 * @param[in]      id_offset       Offset added to id.x() for lane 0.
 * @param[in]      step            Distance, in output elements, between consecutive lanes.
 * @param[in]      pool_size       Pooling window extent (used for both x and y).
 * @param[in]      upper_bound_w   Largest value the window end may take along x.
 * @param[in]      upper_bound_h   Largest value the window end may take along y.
 * @param[in]      pad_x           Padding subtracted from the window start along x.
 * @param[in]      pad_y           Padding subtracted from the window start along y.
 * @param[in]      stride_x        Stride along x.
 * @param[in]      stride_y        Stride along y.
 */
inline void scale_vector_s16x8(bool exclude_padding, uint16x8_t &v, const Coordinates &id, int id_offset, int step,
                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Vertical extent of the window: identical for every lane
    const int raw_y   = id.y() * stride_y - pad_y;
    const int last_y  = std::min(raw_y + pool_size, upper_bound_h);
    const int first_y = exclude_padding ? std::max(0, raw_y) : raw_y;
    const int rows    = last_y - first_y;

    // Spill the vector to memory so that the lanes can be processed one by one
    uint16_t lanes[8];
    vst1q_u16(lanes, v);

    int       raw_x     = (id.x() + id_offset) * stride_x - pad_x;
    const int x_advance = step * stride_x;
    for(uint16_t &lane : lanes)
    {
        const int   last_x  = std::min(raw_x + pool_size, upper_bound_w);
        const int   first_x = exclude_padding ? std::max(0, raw_x) : raw_x;
        const float scale   = 1.f / (rows * (last_x - first_x));

        lane = static_cast<uint16_t>(lane * scale);
        raw_x += x_advance;
    }

    // Reload all eight scaled lanes
    v = vld1q_u16(lanes);
}
122
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100123Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100124{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000125 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100126
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000127 int pool_stride_x = 0;
128 int pool_stride_y = 0;
129 PoolingType pool_type = pool_info.pool_type();
130 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100131 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +0100132
Anthony Barbiereaefd002018-07-20 17:49:35 +0100133 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100134 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
Georgios Pinitas55186712018-01-08 17:37:12 +0000135 ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000136
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000137 if(output->total_size() != 0)
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100138 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000139 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Michalis Spyrou57dac842018-03-01 16:03:50 +0000140 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
141 ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
142 || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
Georgios Pinitas1dad50e2017-07-03 17:51:34 +0100143 }
144
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000145 return Status{};
146}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100147
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000148Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000149{
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000150 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
151 ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000152
153 return Status{};
154}
155
156std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
157 BorderSize &border_size,
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000158 unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000159{
Giorgio Arena9fb6c7e2018-08-22 12:15:25 +0100160 // Output auto inizialitation if not yet initialized
161 auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
162
Michalis Spyrou57dac842018-03-01 16:03:50 +0000163 DataLayout data_layout = input->data_layout();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000164 unsigned int num_elems_read_per_iteration = 0;
165 unsigned int num_elems_horizontal_window = 0;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000166 int pool_stride_x = 0;
167 int pool_stride_y = 0;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000168 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
169 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
170 const int input_width = input->dimension(idx_width);
171 const int input_height = input->dimension(idx_height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000172 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
173 std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000174 const int pool_pad_right = pad_stride_info.pad_right();
175 const int pool_pad_top = pad_stride_info.pad_top();
176 const int pool_pad_left = pad_stride_info.pad_left();
177 const int pool_pad_bottom = pad_stride_info.pad_bottom();
178 const bool is_square = pool_size_x == pool_size_y;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000179
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000180 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +0000181 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
182 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000183 pool_size_x,
184 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000185 pad_stride_info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100186
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000187 //If it's not squared and optimized will be executed the MxN
188 num_elems_read_per_iteration = 1;
189 num_elems_processed_per_iteration = 1;
190 num_elems_horizontal_window = 1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100191
Michalis Spyrou57dac842018-03-01 16:03:50 +0000192 const bool is_nhwc = data_layout == DataLayout::NHWC;
193
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000194 if(is_square)
195 {
196 switch(input->data_type())
197 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000198 case DataType::QASYMM8:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000199 if(is_nhwc)
200 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100201 num_elems_processed_per_iteration = 16;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000202 break;
203 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000204 switch(pool_size_x)
205 {
206 case 2:
207 num_elems_read_per_iteration = 16;
208 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
209 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
210 break;
211 case 3:
212 num_elems_read_per_iteration = 16;
213 num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
214 num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
215 break;
216 default:
217 break;
218 }
219 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000220#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
221 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000222 if(is_nhwc)
223 {
224 num_elems_processed_per_iteration = 8;
225 break;
226 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000227 switch(pool_size_x)
228 {
229 case 2:
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000230 case 3:
231 num_elems_read_per_iteration = 4;
232 num_elems_processed_per_iteration = 1;
233 num_elems_horizontal_window = 1;
234 break;
235 default:
236 break;
237 }
238 break;
239#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
240 case DataType::F32:
Michalis Spyrou57dac842018-03-01 16:03:50 +0000241 if(is_nhwc)
242 {
Georgios Pinitas64f1a902018-09-18 13:42:51 +0100243 num_elems_processed_per_iteration = 4;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000244 break;
245 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000246 switch(pool_size_x)
247 {
248 case 2:
249 num_elems_read_per_iteration = 2;
250 break;
251 case 3:
252 num_elems_read_per_iteration = 4; // We use vload4 for pooling3
253 break;
254 case 7:
255 num_elems_read_per_iteration = 8; // We use vload8 for pooling7
256 break;
257 default:
258 break;
259 }
260 num_elems_processed_per_iteration = 1;
261 num_elems_horizontal_window = 1;
262 break;
263 default:
264 ARM_COMPUTE_ERROR("Element size not supported");
265 break;
266 }
267 }
Michalis Spyrou57dac842018-03-01 16:03:50 +0000268 else
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000269 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000270 if(is_nhwc)
271 {
Michalis Spyrouced25572018-10-01 16:26:20 +0100272 num_elems_processed_per_iteration = 16 / input->element_size();
Michalis Spyrou57dac842018-03-01 16:03:50 +0000273 }
274 }
275
276 bool window_changed = false;
277 Window win{};
278 if(data_layout == DataLayout::NCHW)
279 {
280 // Number of iterations in X dimension
281 const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
282
283 // Upper limit for the number of right/bottom border elements that are accessed
284 const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
285 const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
286
287 border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
288 border_size.right = std::max(upper_bound_w, pool_pad_right);
289 border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
290
291 TensorShape output_shape{ input->tensor_shape() };
292 output_shape.set(0, pooled_w);
293 output_shape.set(1, pooled_h);
294 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
295
296 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
297 AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
298
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000299 AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
300 window_changed = update_window_and_padding(win, input_access, output_access);
301 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
302 }
303 else
304 {
Michalis Spyrou57dac842018-03-01 16:03:50 +0000305 TensorShape output_shape{ input->tensor_shape() };
306 output_shape.set(1, pooled_w);
307 output_shape.set(2, pooled_h);
308 TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
309
310 win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
311 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
312
313 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
314 window_changed = update_window_and_padding(win, input_access, output_access);
315 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000316 }
317
318 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
319 return std::make_pair(err, win);
320}
321} // namespace
322
323NEPoolingLayerKernel::NEPoolingLayerKernel()
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000324 : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000325{
326}
327
328BorderSize NEPoolingLayerKernel::border_size() const
329{
330 return _border_size;
331}
332
333void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
334{
335 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
336
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000337 const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000338 const bool is_global_pooling = pool_info.is_global_pooling();
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000339 const int pool_stride_x = pad_stride_info.stride().first;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000340
341 // Get data layout
342 const DataLayout data_layout = input->info()->data_layout();
343 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
344 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000345
346 // Update pool size in case of global pooling
Pablo Tello77e6c552018-12-04 15:33:49 +0000347 const Size2D pool_size(
348 is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width,
349 is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000350
351 // Validate pool info before calling scaled_dimensions
Pablo Tello77e6c552018-12-04 15:33:49 +0000352 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000353
354 // Check output dimensions
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000355 unsigned int pooled_w, pooled_h;
Michalis Spyrou57dac842018-03-01 16:03:50 +0000356 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
357 input->info()->dimension(idx_height),
Pablo Tello77e6c552018-12-04 15:33:49 +0000358 pool_size.x(),
359 pool_size.y(),
Diego Lopez Recas61ef5bf2017-12-11 12:36:55 +0000360 pad_stride_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000361
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000362 // Perform validation step
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100363 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100364
365 // Set instance variables
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000366 _input = input;
367 _output = output;
368 _pool_info = pool_info;
Pablo Tello77e6c552018-12-04 15:33:49 +0000369 _is_square = (pool_size.x() == pool_size.y());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100370
Georgios Pinitas55186712018-01-08 17:37:12 +0000371 // Get data type
372 const DataType data_type = input->info()->data_type();
Michalis Spyrou57dac842018-03-01 16:03:50 +0000373 const bool is_nchw = data_layout == DataLayout::NCHW;
Georgios Pinitas55186712018-01-08 17:37:12 +0000374
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100375 if(data_type == DataType::QASYMM8)
Georgios Pinitas55186712018-01-08 17:37:12 +0000376 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000377 if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000378 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000379 if(is_nchw)
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100380 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000381 _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw;
382 }
383 else
384 {
385 _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
Georgios Pinitas55186712018-01-08 17:37:12 +0000386 }
387 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000388 else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000389 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000390 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000391 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000392 _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw;
393 }
394 else
395 {
396 _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
Georgios Pinitas55186712018-01-08 17:37:12 +0000397 }
398 }
399 else
400 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000401 if(is_nchw)
Georgios Pinitas55186712018-01-08 17:37:12 +0000402 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000403 _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw;
404 }
405 else
406 {
407 _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
Georgios Pinitas55186712018-01-08 17:37:12 +0000408 }
409 }
410 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000411 else if(data_type == DataType::F16)
412 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000413 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000414 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000415 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000416 {
417 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000418 {
419 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000420 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000421 _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000422 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000423 else
424 {
425 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
426 }
427 }
428 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000429 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000430 {
431 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000432 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000433 _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000434 }
Pablo Tello77e6c552018-12-04 15:33:49 +0000435 else
436 {
437 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
438 }
439 }
440 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000441 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000442 {
443 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000444 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000445 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
446 }
447 else
448 {
449 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000450 }
451 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000452 }
453 break;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000454 }
455 }
456 else
457 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000458 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000459 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000460 _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
461 }
462 else
463 {
464 _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000465 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000466 }
467 }
468 else if(data_type == DataType::F32)
469 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000470 if(_is_square)
Georgios Pinitas55186712018-01-08 17:37:12 +0000471 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000472 switch(pool_size.x())
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000473 {
474 case 2:
Pablo Tello77e6c552018-12-04 15:33:49 +0000475 {
476 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000477 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000478 _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
479 }
480 else
481 {
482 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000483 }
484 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000485 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000486 case 3:
Pablo Tello77e6c552018-12-04 15:33:49 +0000487 {
488 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000489 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000490 _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
491 }
492 else
493 {
494 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000495 }
496 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000497 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000498 case 7:
Pablo Tello77e6c552018-12-04 15:33:49 +0000499 {
500 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000501 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000502 _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
503 }
504 else
505 {
506 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000507 }
508 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000509 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000510 default:
Pablo Tello77e6c552018-12-04 15:33:49 +0000511 {
512 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000513 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000514 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
515 }
516 else
517 {
518 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000519 }
520 break;
Pablo Tello77e6c552018-12-04 15:33:49 +0000521 }
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000522 }
523 }
524 else
525 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000526 if(is_nchw)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000527 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000528 _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
529 }
530 else
531 {
532 _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000533 }
Georgios Pinitas55186712018-01-08 17:37:12 +0000534 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100535 }
536
537 // Configure kernel window
Pablo Tello77e6c552018-12-04 15:33:49 +0000538 auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000539 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
540 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100541}
542
/** 2x2 pooling on QASYMM8 data in NCHW layout.
 *
 * Each iteration loads 16 bytes from two consecutive input rows. With an x
 * stride of 1 two vectors of eight results are produced (windows starting at
 * even and at odd columns) and stored interleaved; with any other stride a
 * single vector of eight results is stored.
 *
 * @param[in] window_input    Window used to iterate over the input.
 * @param[in] window          Window used to iterate over the output.
 * @param[in] pooling_type    MAX selects max pooling; any other value takes the scaled-sum (average) path.
 * @param[in] exclude_padding If true, padding is not counted in the averaging factor.
 */
void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
    Iterator input(_input, window_input);
    Iterator output(_output, window);

    constexpr int pool_size = 2;

    const auto pad_stride      = _pool_info.pad_stride_info();
    const int  pool_pad_right  = pad_stride.pad_right();
    const int  pool_pad_top    = pad_stride.pad_top();
    const int  pool_pad_left   = pad_stride.pad_left();
    const int  pool_pad_bottom = pad_stride.pad_bottom();
    const int  pool_stride_x   = pad_stride.stride().first;
    const int  pool_stride_y   = pad_stride.stride().second;

    // The padded area only counts towards the window bounds when padding is not excluded
    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);

    // Base pointers of the two rows covered by the window, starting inside the top-left padding
    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));

    const bool is_unit_stride = (pool_stride_x == 1);

    // With stride 1 each result vector holds every second output element
    const int scale_step_x = is_unit_stride ? 2 : 1;

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t row_top    = vld1q_u8(input_top_ptr + input.offset());
        const uint8x16_t row_bottom = vld1q_u8(input_bottom_ptr + input.offset());

        uint8x8_t lower_res = {};
        uint8x8_t upper_res = {};

        if(pooling_type == PoolingType::MAX)
        {
            // Column-wise max of the two rows, then pair-wise max of neighbouring columns
            const uint8x16_t col_max = vmaxq_u8(row_top, row_bottom);
            lower_res                = vpmax_u8(vget_low_u8(col_max), vget_high_u8(col_max));
            if(is_unit_stride)
            {
                // Shift by one column to obtain the windows starting at odd columns
                const uint8x16_t col_max_shifted = vextq_u8(col_max, col_max, 1);
                upper_res                        = vpmax_u8(vget_low_u8(col_max_shifted), vget_high_u8(col_max_shifted));
            }
        }
        else
        {
            // Widen to 16 bit and add the two rows
            const uint16x8_t row_sum_lo = vaddq_u16(vmovl_u8(vget_low_u8(row_top)), vmovl_u8(vget_low_u8(row_bottom)));
            const uint16x8_t row_sum_hi = vaddq_u16(vmovl_u8(vget_high_u8(row_top)), vmovl_u8(vget_high_u8(row_bottom)));

            // Pair-wise add neighbouring columns
            uint16x8_t res_lower = vcombine_u16(vpadd_u16(vget_low_u16(row_sum_lo), vget_high_u16(row_sum_lo)),
                                                vpadd_u16(vget_low_u16(row_sum_hi), vget_high_u16(row_sum_hi)));

            // Scale lower result
            scale_vector_s16x8(exclude_padding, res_lower, id, 0, scale_step_x,
                               pool_size, upper_bound_w, upper_bound_h,
                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            lower_res = vmovn_u16(res_lower);

            // Compute upper result for stride_x == 1
            if(is_unit_stride)
            {
                // Row sums shifted by one column
                const uint16x8_t shifted_lo = vextq_u16(row_sum_lo, row_sum_hi, 1);
                const uint16x8_t shifted_hi = vextq_u16(row_sum_hi, row_sum_hi, 1);

                // Pair-wise add the shifted sums
                uint16x8_t res_upper = vcombine_u16(vpadd_u16(vget_low_u16(shifted_lo), vget_high_u16(shifted_lo)),
                                                    vpadd_u16(vget_low_u16(shifted_hi), vget_high_u16(shifted_hi)));

                // Scale upper result
                scale_vector_s16x8(exclude_padding, res_upper, id, 1, 2,
                                   pool_size, upper_bound_w, upper_bound_h,
                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                upper_res = vmovn_u16(res_upper);
            }
        }

        // Store result
        if(is_unit_stride)
        {
            // Interleave the two result vectors
            const uint8x8x2_t res = { { lower_res, upper_res } };
            vst2_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
        }
        else
        {
            vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), lower_res);
        }
    },
    input, output);
}
655
Pablo Tello77e6c552018-12-04 15:33:49 +0000656void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100657{
Pablo Tello77e6c552018-12-04 15:33:49 +0000658 ARM_COMPUTE_UNUSED(pooling_type);
659 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000660#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100661 Iterator input(_input, window_input);
662 Iterator output(_output, window);
663
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000664 constexpr const int pool_size = 3;
665 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
666 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
667 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
668 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
669 int pool_stride_x = 0;
670 int pool_stride_y = 0;
Pablo Tello0c34fe22017-06-26 17:17:42 +0100671 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000672 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
673 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100674
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000675 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
676 const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
677 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100678
679 execute_window_loop(window, [&](const Coordinates & id)
680 {
Georgios Pinitascdf51452017-08-31 14:21:36 +0100681 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
682 float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
683 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
684 float16x4_t res = {};
685
686 // Get power of 2 in case of l2 pooling
687 if(pooling_type == PoolingType::L2)
688 {
689 top_data = vmul_f16(top_data, top_data);
690 middle_data = vmul_f16(middle_data, middle_data);
691 bottom_data = vmul_f16(bottom_data, bottom_data);
692 }
693
694 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100695 {
696 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000697 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100698 const float16x4_t scale_v = vdup_n_f16(scale);
699 // Perform pooling
700 const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
701 res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
702 res = vmul_f16(vpadd_f16(res, res), scale_v);
703 }
704 else
705 {
706 const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
707 res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
708 res = vpmax_f16(res, res);
709 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100710
711 // Calculate square-root in case of l2 pooling
712 if(pooling_type == PoolingType::L2)
713 {
714 res = vinv_f16(vinvsqrt_f16(res));
715 }
716
Pablo Tello0c34fe22017-06-26 17:17:42 +0100717 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
718 },
719 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000720#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100721 ARM_COMPUTE_UNUSED(window_input);
722 ARM_COMPUTE_UNUSED(window);
723 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000724#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100725}
726
Pablo Tello77e6c552018-12-04 15:33:49 +0000727void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100728{
Pablo Tello77e6c552018-12-04 15:33:49 +0000729 ARM_COMPUTE_UNUSED(pooling_type);
730 ARM_COMPUTE_UNUSED(exclude_padding);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000731#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0c34fe22017-06-26 17:17:42 +0100732 Iterator input(_input, window_input);
733 Iterator output(_output, window);
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000734 constexpr int pool_size = 2;
735 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
736 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
737 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
738 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
739 int pool_stride_x, pool_stride_y = 0;
740 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
741 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
742 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100743
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000744 const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
745 const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
Pablo Tello0c34fe22017-06-26 17:17:42 +0100746
747 execute_window_loop(window, [&](const Coordinates & id)
748 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100749 float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
750 float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
751 float16x4_t res = {};
Pablo Tello0c34fe22017-06-26 17:17:42 +0100752
Georgios Pinitascdf51452017-08-31 14:21:36 +0100753 // Get power of 2 in case of l2 pooling
754 if(pooling_type == PoolingType::L2)
755 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100756 top_data = vmul_f16(top_data, top_data);
757 bottom_data = vmul_f16(bottom_data, bottom_data);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100758 }
759
760 if(pooling_type != PoolingType::MAX)
Pablo Tello0c34fe22017-06-26 17:17:42 +0100761 {
Pablo Tello77e6c552018-12-04 15:33:49 +0000762 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100763 const float16x4_t scale_v = vdup_n_f16(scale);
764
765 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
766 res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100767 }
768 else
769 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100770 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
771 res = vpmax_f16(max_data, max_data);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100772 }
Georgios Pinitascdf51452017-08-31 14:21:36 +0100773
774 // Calculate square-root in case of l2 pooling
775 if(pooling_type == PoolingType::L2)
776 {
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100777 res = vinv_f16(vinvsqrt_f16(res));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100778 }
779
780 // Store result
Georgios Pinitas13d96e02018-08-23 11:20:23 +0100781 *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
Pablo Tello0c34fe22017-06-26 17:17:42 +0100782 },
783 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000784#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100785 ARM_COMPUTE_UNUSED(window_input);
786 ARM_COMPUTE_UNUSED(window);
787 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000788#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0c34fe22017-06-26 17:17:42 +0100789}
790
Pablo Tello77e6c552018-12-04 15:33:49 +0000791void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +0000792{
793 Iterator input(_input, window_input);
794 Iterator output(_output, window);
795
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000796 constexpr int pool_size = 3;
797 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
798 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
799 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
800 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
801 int pool_stride_x = 0;
802 int pool_stride_y = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +0000803 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000804 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
805 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +0000806
Michalis Spyroubd0e6122018-01-23 09:52:16 +0000807 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
808 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
809 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
Georgios Pinitas55186712018-01-08 17:37:12 +0000810
811 execute_window_loop(window, [&](const Coordinates & id)
812 {
813 const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
814 const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
815 const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
816
817 if(pooling_type == PoolingType::AVG)
818 {
819 // Convert data to u16
820 const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
821 const uint16x8x2_t middle_data_u16 = { { vmovl_u8(vget_low_u8(middle_data)), vmovl_u8(vget_high_u8(middle_data)) } };
822 const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
823
824 // Calculate row sums
825 const uint16x8x2_t vrsum =
826 {
827 {
828 vaddq_u16(vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]), middle_data_u16.val[0]),
829 vaddq_u16(vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]), middle_data_u16.val[1]),
830 }
831 };
832 const uint16x8x2_t vrsum_shifted_1 =
833 {
834 {
835 vextq_u16(vrsum.val[0], vrsum.val[1], 1),
836 vextq_u16(vrsum.val[1], vrsum.val[1], 1)
837 }
838 };
839 const uint16x8x2_t vrsum_shifted_2 =
840 {
841 {
842 vextq_u16(vrsum.val[0], vrsum.val[1], 2),
843 vextq_u16(vrsum.val[1], vrsum.val[1], 2)
844 }
845 };
846 // Calculate final sum
847 uint16x8x2_t final_sum =
848 {
849 {
850 vaddq_u16(vaddq_u16(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
851 vaddq_u16(vaddq_u16(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
852 }
853 };
854 if(pool_stride_x == 2)
855 {
856 uint16x8_t res =
857 {
858 vgetq_lane_u16(final_sum.val[0], 0),
859 vgetq_lane_u16(final_sum.val[0], 2),
860 vgetq_lane_u16(final_sum.val[0], 4),
861 vgetq_lane_u16(final_sum.val[0], 6),
862 vgetq_lane_u16(final_sum.val[1], 0),
863 vgetq_lane_u16(final_sum.val[1], 2),
864 vgetq_lane_u16(final_sum.val[1], 4),
865 vgetq_lane_u16(final_sum.val[1], 6),
866 };
867
Pablo Tello77e6c552018-12-04 15:33:49 +0000868 scale_vector_s16x8(exclude_padding, res, id, 0, 1,
869 pool_size, upper_bound_w, upper_bound_h,
870 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +0000871 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
872 }
873 else
874 {
875 // Scale lower result
Pablo Tello77e6c552018-12-04 15:33:49 +0000876 scale_vector_s16x8(exclude_padding, final_sum.val[0], id, 0, 1,
877 pool_size, upper_bound_w, upper_bound_h,
878 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +0000879 // Scale lower result
Pablo Tello77e6c552018-12-04 15:33:49 +0000880 scale_vector_s16x8(exclude_padding, final_sum.val[1], id, 8, 1,
881 pool_size, upper_bound_w, upper_bound_h,
882 pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +0000883 const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
884 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
885 }
886 }
887 else
888 {
889 const uint8x16_t max_data = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data);
890 const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1);
891 const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2);
892 const uint8x16_t final_max = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2);
893
894 if(pool_stride_x == 2)
895 {
896 const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
897 static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
898 const uint8x8_t res = vtbl2_u8(table, lookup_val);
899 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
900 }
901 else
902 {
903 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
904 }
905 }
906 },
907 input, output);
908}
909
Pablo Tello77e6c552018-12-04 15:33:49 +0000910void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100911{
Pablo Tello77e6c552018-12-04 15:33:49 +0000912 ARM_COMPUTE_UNUSED(pooling_type);
913 ARM_COMPUTE_UNUSED(exclude_padding);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000914#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
915 Iterator input(_input, window_input);
916 Iterator output(_output, window);
917
918 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
919 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
920 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
921 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
922 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
923 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
924 int pool_stride_x = 0;
925 int pool_stride_y = 0;
926 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
927 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
928 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
929
930 execute_window_loop(window, [&](const Coordinates & id)
931 {
932 float16_t res = 0.0f;
933 float16x8_t vres = vdupq_n_f16(0.0f);
934
935 if(pooling_type != PoolingType::MAX)
936 {
937 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +0000938 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Isabella Gottardi7567f5f2018-01-30 15:26:00 +0000939
940 // Perform pooling
941
942 for(int y = 0; y < pool_size_y; ++y)
943 {
944 int x = 0;
945 for(; x <= (pool_size_x - 8); x += 8)
946 {
947 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
948 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
949
950 // Get power of 2 in case of l2 pooling and accumulate
951 if(pooling_type == PoolingType::L2)
952 {
953 vres = vaddq_f16(vres, vmulq_f16(data, data));
954 }
955 else
956 {
957 vres = vaddq_f16(vres, data);
958 }
959 }
960
961 // Leftover for loop
962 for(; x < pool_size_x; ++x)
963 {
964 float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
965
966 // Get power of 2 in case of l2 pooling
967 if(pooling_type == PoolingType::L2)
968 {
969 data *= data;
970 }
971
972 res += data;
973 }
974 }
975
976 // Reduction
977 float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
978 res += vget_lane_f16(tmp, 0);
979 res += vget_lane_f16(tmp, 1);
980 res += vget_lane_f16(tmp, 2);
981 res += vget_lane_f16(tmp, 3);
982
983 // Divide by scale
984 res *= scale;
985 }
986 else
987 {
988 float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
989 res = std::numeric_limits<float>::lowest();
990
991 for(int y = 0; y < pool_size_y; ++y)
992 {
993 int x = 0;
994 for(; x <= (pool_size_x - 8); x += 8)
995 {
996 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
997 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
998 vres = vmaxq_f16(vres, data);
999 }
1000
1001 // Leftover for loop
1002 for(; x < pool_size_x; ++x)
1003 {
1004 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
1005 res = std::max(res, data);
1006 }
1007 }
1008
1009 float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1010 res = std::max(res, vget_lane_f16(tmp, 0));
1011 res = std::max(res, vget_lane_f16(tmp, 1));
1012 res = std::max(res, vget_lane_f16(tmp, 2));
1013 res = std::max(res, vget_lane_f16(tmp, 3));
1014 }
1015
1016 // Calculate square-root in case of l2 pooling
1017 if(pooling_type == PoolingType::L2)
1018 {
1019 res = std::sqrt(res);
1020 }
1021
1022 // Store result
1023 *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1024 },
1025 input, output);
1026
1027#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1028 ARM_COMPUTE_UNUSED(window_input);
1029 ARM_COMPUTE_UNUSED(window);
1030 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1031#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1032}
1033
Pablo Tello77e6c552018-12-04 15:33:49 +00001034void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001035{
Pablo Tello77e6c552018-12-04 15:33:49 +00001036 ARM_COMPUTE_UNUSED(pooling_type);
1037 ARM_COMPUTE_UNUSED(exclude_padding);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001038#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1039 Iterator input(_input, window_input);
1040 Iterator output(_output, window);
1041
1042 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
1043 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
1044 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1045 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1046 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1047 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1048 int pool_stride_x = 0;
1049 int pool_stride_y = 0;
1050 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1051 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1052 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1053
1054 float16x8_t vres;
1055
1056 execute_window_loop(window, [&](const Coordinates & id)
1057 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001058 const int idx_width = id.y() * pool_stride_x;
1059 const int idx_height = id.z() * pool_stride_y;
1060 const int pool_limit_y = pool_pad_top - idx_height;
1061 const int pool_limit_x = pool_pad_left - idx_width;
1062
1063 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1064 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1065 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1066 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1067
Michalis Spyrou57dac842018-03-01 16:03:50 +00001068 if(pooling_type != PoolingType::MAX)
1069 {
1070 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001071 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1072 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001073 const float16x8_t scale_v = vdupq_n_f16(scale);
1074
1075 // Perform pooling
1076 vres = vdupq_n_f16(0.0f);
Michalis Spyrouced25572018-10-01 16:26:20 +01001077 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001078 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001079 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001080 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00001081 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
1082 (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
1083
1084 // Get power of 2 in case of l2 pooling and accumulate
1085 if(pooling_type == PoolingType::L2)
1086 {
1087 vres = vaddq_f16(vres, vmulq_f16(data, data));
1088 }
1089 else
1090 {
1091 vres = vaddq_f16(vres, data);
1092 }
1093 }
1094 }
1095 // Divide by scale
1096 vres = vmulq_f16(vres, scale_v);
1097 }
1098 else
1099 {
1100 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001101
1102 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001103 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001104 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001105 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00001106 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
1107 (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
1108 vres = vmaxq_f16(vres, data);
1109 }
1110 }
1111 }
1112
1113 // Calculate square-root in case of l2 pooling
1114 if(pooling_type == PoolingType::L2)
1115 {
1116 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1117 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1118 }
1119
1120 // Store result
1121 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
1122 },
1123 input, output);
1124
1125#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1126 ARM_COMPUTE_UNUSED(window_input);
1127 ARM_COMPUTE_UNUSED(window);
1128 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1129#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1130}
1131
Pablo Tello77e6c552018-12-04 15:33:49 +00001132void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001133{
1134 Iterator input(_input, window_input);
1135 Iterator output(_output, window);
1136
1137 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
1138 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001139 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1140 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1141 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1142 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1143 int pool_stride_x = 0;
1144 int pool_stride_y = 0;
Gian Marco Iodice16824302017-09-28 15:41:37 +01001145 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001146 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1147 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001148
1149 execute_window_loop(window, [&](const Coordinates & id)
1150 {
1151 float res = 0.0f;
1152
1153 if(pooling_type != PoolingType::MAX)
1154 {
1155 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001156 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Gian Marco Iodice16824302017-09-28 15:41:37 +01001157
1158 // Perform pooling
1159 float32x4_t vres = vdupq_n_f32(0.0f);
1160
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001161 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001162 {
1163 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001164 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001165 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001166 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1167 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001168
1169 // Get power of 2 in case of l2 pooling and accumulate
1170 if(pooling_type == PoolingType::L2)
1171 {
1172 vres = vmlaq_f32(vres, data, data);
1173 }
1174 else
1175 {
1176 vres = vaddq_f32(vres, data);
1177 }
1178 }
1179
1180 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001181 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001182 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001183 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001184
1185 // Get power of 2 in case of l2 pooling
1186 if(pooling_type == PoolingType::L2)
1187 {
1188 data *= data;
1189 }
1190
1191 res += data;
1192 }
1193 }
1194
1195#if defined(__aarch64__)
1196 // Reduction operation available on 64 bit architectures only
1197 res += vaddvq_f32(vres);
1198#else // __aarch64__
1199 // Reduction
1200 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1201 tmp = vpadd_f32(tmp, tmp);
1202
1203 res += vget_lane_f32(tmp, 0);
1204#endif // __aarch64__
1205 // Divide by scale
1206 res *= scale;
1207 }
1208 else
1209 {
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001210 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1211 res = std::numeric_limits<float>::lowest();
Gian Marco Iodice16824302017-09-28 15:41:37 +01001212
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001213 for(int y = 0; y < pool_size_y; ++y)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001214 {
1215 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001216 for(; x <= (pool_size_x - 4); x += 4)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001217 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001218 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1219 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001220 vres = vmaxq_f32(vres, data);
1221 }
1222
1223 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001224 for(; x < pool_size_x; ++x)
Gian Marco Iodice16824302017-09-28 15:41:37 +01001225 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001226 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Gian Marco Iodice16824302017-09-28 15:41:37 +01001227 res = std::max(res, data);
1228 }
1229 }
1230
1231#if defined(__aarch64__)
1232 // Reduction operation available on 64 bit architectures only
1233 res = std::max(vmaxvq_f32(vres), res);
1234#else // __aarch64__
1235 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1236 tmp = vpmax_f32(tmp, tmp);
1237
1238 res = std::max(res, vget_lane_f32(tmp, 0));
1239#endif // __aarch64__
1240 }
1241
1242 // Calculate square-root in case of l2 pooling
1243 if(pooling_type == PoolingType::L2)
1244 {
1245 res = std::sqrt(res);
1246 }
1247
1248 // Store result
1249 *(reinterpret_cast<float *>(output.ptr())) = res;
1250 },
1251 input, output);
1252}
1253
Pablo Tello77e6c552018-12-04 15:33:49 +00001254void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1255{
1256 Iterator input(_input, window_input);
1257 Iterator output(_output, window);
1258
1259 constexpr int pool_size = 2;
1260 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1261 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1262 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1263 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1264 int pool_stride_x = 0;
1265 int pool_stride_y = 0;
1266 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1267 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1268 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1269
1270 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1271 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1272
1273 execute_window_loop(window, [&](const Coordinates & id)
1274 {
1275 float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1276 float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1277 float32x2_t res = {};
1278 float final_res = 0;
1279
1280 // Get power of 2 in case of l2 pooling
1281 if(pooling_type == PoolingType::L2)
1282 {
1283 top_data = vmul_f32(top_data, top_data);
1284 bottom_data = vmul_f32(bottom_data, bottom_data);
1285 }
1286
1287 if(pooling_type != PoolingType::MAX)
1288 {
1289 // Calculate scale
1290 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1291 const float32x2_t scale_v = vdup_n_f32(scale);
1292
1293 // Perform pooling
1294 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1295 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1296 }
1297 else
1298 {
1299 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1300 res = vpmax_f32(max_data, max_data);
1301 }
1302 final_res = vget_lane_f32(res, 0);
1303
1304 // Calculate square-root in case of l2 pooling
1305 if(pooling_type == PoolingType::L2)
1306 {
1307 final_res = sqrt(final_res);
1308 }
1309
1310 // Store result
1311 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1312 },
1313 input, output);
1314}
1315
1316void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1317{
1318 Iterator input(_input, window_input);
1319 Iterator output(_output, window);
1320
1321 constexpr const int pool_size = 3;
1322 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1323 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1324 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1325 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1326 int pool_stride_x = 0;
1327 int pool_stride_y = 0;
1328 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1329 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1330 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1331
1332 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1333 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1334 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1335
1336 execute_window_loop(window, [&](const Coordinates & id)
1337 {
1338 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1339 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1340 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1341 float32x2_t res = {};
1342 float final_res = 0;
1343
1344 // Get power of 2 in case of l2 pooling
1345 if(pooling_type == PoolingType::L2)
1346 {
1347 top_data = vmulq_f32(top_data, top_data);
1348 middle_data = vmulq_f32(middle_data, middle_data);
1349 bottom_data = vmulq_f32(bottom_data, bottom_data);
1350 }
1351
1352 if(pooling_type != PoolingType::MAX)
1353 {
1354 // Calculate scale
1355 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1356 const float32x2_t scale_v = vdup_n_f32(scale);
1357
1358 // Perform pooling
1359 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1360 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1361 res = vmul_f32(vpadd_f32(res, res), scale_v);
1362 }
1363 else
1364 {
1365 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1366 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1367 res = vpmax_f32(res, res);
1368 }
1369 final_res = vget_lane_f32(res, 0);
1370
1371 // Calculate square-root in case of l2 pooling
1372 if(pooling_type == PoolingType::L2)
1373 {
1374 final_res = sqrt(final_res);
1375 }
1376
1377 // Store result
1378 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1379 },
1380 input, output);
1381}
1382
1383void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1384{
1385 Iterator input(_input, window_input);
1386 Iterator output(_output, window);
1387
1388 constexpr const int pool_size = 7;
1389 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1390 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1391 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1392 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1393 int pool_stride_x = 0;
1394 int pool_stride_y = 0;
1395 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1396 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1397 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1398
1399 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1400 for(int i = 0; i < pool_size; ++i)
1401 {
1402 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1403 }
1404
1405 execute_window_loop(window, [&](const Coordinates & id)
1406 {
1407 float32x2_t res = {};
1408 float final_res = 0.f;
1409 if(pooling_type != PoolingType::MAX)
1410 {
1411 // Calculate scale
1412 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1413 const float32x2_t scale_v = vdup_n_f32(scale);
1414
1415 // Perform pooling
1416 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1417 // Get power of 2 in case of l2 pooling
1418 if(pooling_type == PoolingType::L2)
1419 {
1420 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1421 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1422 }
1423 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1424 for(int i = 1; i < pool_size; ++i)
1425 {
1426 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1427 // Get power of 2 in case of l2 pooling
1428 if(pooling_type == PoolingType::L2)
1429 {
1430 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1431 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1432 }
1433 sum_data = vaddq_f32(sum_data, data.val[0]);
1434 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1435 }
1436 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1437 res = vmul_f32(vpadd_f32(res, res), scale_v);
1438 }
1439 else
1440 {
1441 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1442 for(int i = 1; i < pool_size; ++i)
1443 {
1444 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1445 max_data = vmax2q_f32(max_data, data);
1446 }
1447 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1448 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1449 res = vpmax_f32(res, res);
1450 }
1451 final_res = vget_lane_f32(res, 0);
1452
1453 // Calculate square-root in case of l2 pooling
1454 if(pooling_type == PoolingType::L2)
1455 {
1456 final_res = sqrt(final_res);
1457 }
1458
1459 // Store result
1460 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1461 },
1462 input, output);
1463}
1464
1465void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001466{
1467 Iterator input(_input, window_input);
1468 Iterator output(_output, window);
1469
1470 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
1471 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
1472 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1473 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1474 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1475 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1476 int pool_stride_x = 0;
1477 int pool_stride_y = 0;
1478 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1479 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1480 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1481
1482 float32x4_t vres;
1483
1484 execute_window_loop(window, [&](const Coordinates & id)
1485 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001486 const int idx_width = id.y() * pool_stride_x;
1487 const int idx_height = id.z() * pool_stride_y;
1488 const int pool_limit_y = pool_pad_top - idx_height;
1489 const int pool_limit_x = pool_pad_left - idx_width;
1490
1491 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1492 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1493 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1494 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1495
Michalis Spyrou57dac842018-03-01 16:03:50 +00001496 if(pooling_type != PoolingType::MAX)
1497 {
1498 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001499 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1500 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001501 const float32x4_t scale_v = vdupq_n_f32(scale);
1502
1503 // Perform pooling
1504 vres = vdupq_n_f32(0.0f);
1505
Michalis Spyrouced25572018-10-01 16:26:20 +01001506 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001507 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001508 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001509 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00001510 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
1511 (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
1512
1513 // Get power of 2 in case of l2 pooling and accumulate
1514 if(pooling_type == PoolingType::L2)
1515 {
1516 vres = vmlaq_f32(vres, data, data);
1517 }
1518 else
1519 {
1520 vres = vaddq_f32(vres, data);
1521 }
1522 }
1523 }
1524 // Divide by scale
1525 vres = vmulq_f32(vres, scale_v);
1526 }
1527 else
1528 {
1529 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
Michalis Spyrouced25572018-10-01 16:26:20 +01001530 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001531 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001532 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001533 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00001534 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
1535 (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
1536 vres = vmaxq_f32(vres, data);
1537 }
1538 }
1539 }
1540
1541 // Calculate square-root in case of l2 pooling
1542 if(pooling_type == PoolingType::L2)
1543 {
Georgios Pinitas027ce5b2018-11-08 18:55:36 +00001544 vres = vmulq_f32(vres, vinvsqrtq_f32(vres));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001545 }
1546
1547 // Store result
1548 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
1549 },
1550 input, output);
1551}
1552
Pablo Tello77e6c552018-12-04 15:33:49 +00001553void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Georgios Pinitas55186712018-01-08 17:37:12 +00001554{
1555 Iterator input(_input, window_input);
1556 Iterator output(_output, window);
1557
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001558 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
1559 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001560 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1561 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1562 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1563 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1564 int pool_stride_x = 0;
1565 int pool_stride_y = 0;
Georgios Pinitas55186712018-01-08 17:37:12 +00001566 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001567 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1568 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
Georgios Pinitas55186712018-01-08 17:37:12 +00001569
1570 execute_window_loop(window, [&](const Coordinates & id)
1571 {
1572 uint8_t res = 0;
1573
1574 if(pooling_type != PoolingType::MAX)
1575 {
1576 uint32x4_t vres = vdupq_n_u32(0);
1577 uint32_t sres = 0;
1578
1579 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001580 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
Georgios Pinitas55186712018-01-08 17:37:12 +00001581
1582 // Perform pooling
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001583 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001584 {
1585 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001586 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001587 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001588 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1589 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001590
1591 const uint16x8_t data_u16 = vmovl_u8(data);
1592 vres = vaddq_u32(vres, vaddl_u16(vget_high_u16(data_u16), vget_low_u16(data_u16)));
1593 }
1594
1595 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001596 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001597 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001598 uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001599 sres += data;
1600 }
1601 }
1602
1603 // Reduction
1604 const auto tmp = vpadd_u32(vget_high_u32(vres), vget_low_u32(vres));
1605 sres += vget_lane_u32(tmp, 0) + vget_lane_u32(tmp, 1);
1606
1607 // Divide by scale
1608 res = static_cast<uint8_t>(support::cpp11::round(sres * scale));
1609 }
1610 else
1611 {
1612 uint8x8_t vres = vdup_n_u8(0);
1613 res = 0;
1614
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001615 for(int y = 0; y < pool_size_y; ++y)
Georgios Pinitas55186712018-01-08 17:37:12 +00001616 {
1617 int x = 0;
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001618 for(; x <= (pool_size_x - 8); x += 8)
Georgios Pinitas55186712018-01-08 17:37:12 +00001619 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001620 const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
1621 (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001622 vres = vmax_u8(vres, data);
1623 }
1624
1625 // Leftover for loop
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001626 for(; x < pool_size_x; ++x)
Georgios Pinitas55186712018-01-08 17:37:12 +00001627 {
Michalis Spyroubd0e6122018-01-23 09:52:16 +00001628 const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
Georgios Pinitas55186712018-01-08 17:37:12 +00001629 res = std::max(res, data);
1630 }
1631 }
1632
1633 // Reduce max
1634 vres = vpmax_u8(vres, vres);
1635 vres = vpmax_u8(vres, vres);
1636 vres = vpmax_u8(vres, vres);
1637
1638 // Get max value
1639 res = std::max(res, vget_lane_u8(vres, 0));
1640 }
1641
1642 // Store result
1643 *(reinterpret_cast<uint8_t *>(output.ptr())) = res;
1644 },
1645 input, output);
1646}
1647
Pablo Tello77e6c552018-12-04 15:33:49 +00001648void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001649{
1650 Iterator input(_input, window_input);
1651 Iterator output(_output, window);
1652
1653 const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
1654 const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
1655 const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
1656 const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
1657 const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
1658 const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
1659 int pool_stride_x = 0;
1660 int pool_stride_y = 0;
1661 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
1662 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1663 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1664
Georgios Pinitas283fc602018-11-09 10:46:43 +00001665 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
1666
Michalis Spyrou57dac842018-03-01 16:03:50 +00001667 execute_window_loop(window, [&](const Coordinates & id)
1668 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001669 const int idx_width = id.y() * pool_stride_x;
1670 const int idx_height = id.z() * pool_stride_y;
1671 const int pool_limit_y = pool_pad_top - idx_height;
1672 const int pool_limit_x = pool_pad_left - idx_width;
1673
1674 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1675 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1676 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1677 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1678
Michalis Spyrou57dac842018-03-01 16:03:50 +00001679 if(pooling_type != PoolingType::MAX)
1680 {
1681 uint32x4_t vres1 = vdupq_n_u32(0);
1682 uint32x4_t vres2 = vdupq_n_u32(0);
Michalis Spyrouced25572018-10-01 16:26:20 +01001683 uint32x4_t vres3 = vdupq_n_u32(0);
1684 uint32x4_t vres4 = vdupq_n_u32(0);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001685
1686 // Calculate scale
Pablo Tello77e6c552018-12-04 15:33:49 +00001687 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1688 pool_stride_y);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001689 const float32x4_t scale_v = vdupq_n_f32(scale);
1690
1691 // Perform pooling
Michalis Spyrouced25572018-10-01 16:26:20 +01001692 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001693 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001694 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001695 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001696 const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
1697 (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001698
Michalis Spyrouced25572018-10-01 16:26:20 +01001699 const uint16x8_t data_u16 = vmovl_u8(vget_low_u8(data));
1700 const uint16x8_t data2_u16 = vmovl_u8(vget_high_u8(data));
1701 vres1 = vaddq_u32(vres1, vmovl_u16(vget_low_u16(data_u16)));
1702 vres2 = vaddq_u32(vres2, vmovl_u16(vget_high_u16(data_u16)));
1703 vres3 = vaddq_u32(vres3, vmovl_u16(vget_low_u16(data2_u16)));
1704 vres4 = vaddq_u32(vres4, vmovl_u16(vget_high_u16(data2_u16)));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001705 }
1706 }
Georgios Pinitas283fc602018-11-09 10:46:43 +00001707 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
1708 vres1 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres1), scale_v));
1709 vres2 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres2), scale_v));
1710 vres3 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres3), scale_v));
1711 vres4 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres4), scale_v));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001712
Michalis Spyrouced25572018-10-01 16:26:20 +01001713 uint8x8_t res1 = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
1714 uint8x8_t res2 = vmovn_u16(vcombine_u16(vmovn_u32(vres3), vmovn_u32(vres4)));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001715
1716 // Store result
Michalis Spyrouced25572018-10-01 16:26:20 +01001717 vst1_u8(output.ptr(), res1);
1718 vst1_u8(output.ptr() + 8, res2);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001719 }
1720 else
1721 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001722 uint8x16_t vres = vdupq_n_u8(0);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001723
Michalis Spyrouced25572018-10-01 16:26:20 +01001724 for(int y = pool_start_y; y < pool_end_y; ++y)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001725 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001726 for(int x = pool_start_x; x < pool_end_x; ++x)
Michalis Spyrou57dac842018-03-01 16:03:50 +00001727 {
Michalis Spyrouced25572018-10-01 16:26:20 +01001728 const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
1729 (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
1730 vres = vmaxq_u8(vres, data);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001731 }
1732 }
1733
1734 // Store result
Michalis Spyrouced25572018-10-01 16:26:20 +01001735 vst1q_u8(output.ptr(), vres);
Michalis Spyrou57dac842018-03-01 16:03:50 +00001736 }
1737 },
1738 input, output);
1739}
1740
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001741Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
1742{
1743 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
1744
1745 unsigned int pooled_w = 0;
1746 unsigned int pooled_h = 0;
1747 unsigned int num_elems_processed_per_iteration = 0;
1748 BorderSize border_size(0);
1749
Michalis Spyrou57dac842018-03-01 16:03:50 +00001750 const bool is_global_pooling = pool_info.is_global_pooling();
1751 unsigned int pool_size_x = 0;
1752 unsigned int pool_size_y = 0;
1753
1754 // Get data layout
1755 const DataLayout data_layout = input->data_layout();
1756 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
1757 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
1758
1759 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size().width;
1760 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size().height;
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001761
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001762 // Validate pool info before calling scaled_dimensions
1763 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001764
1765 // Check output dimensions
Michalis Spyrou57dac842018-03-01 16:03:50 +00001766 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
1767 input->dimension(idx_height),
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001768 pool_size_x,
1769 pool_size_y,
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001770 pool_info.pad_stride_info());
1771
Georgios Pinitas13d96e02018-08-23 11:20:23 +01001772 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h));
Isabella Gottardi7567f5f2018-01-30 15:26:00 +00001773 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
1774 pool_size_x, pool_size_y)
1775 .first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001776
1777 return Status{};
1778}
1779
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001780void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001781{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001782 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001783 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1784 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1785 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1786
Pablo Tello77e6c552018-12-04 15:33:49 +00001787 const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
1788 const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
1789 const unsigned int pool_size = _pool_info.pool_size().width;
1790 const bool exclude_padding = _pool_info.exclude_padding();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001791
Michalis Spyrou57dac842018-03-01 16:03:50 +00001792 Window window_input(window);
1793 if(_input->info()->data_layout() == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001794 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00001795 // Set step for input in x and y direction for the input
1796 unsigned int window_x_inc = 0;
1797 switch(_input->info()->data_type())
Pablo Tello0c34fe22017-06-26 17:17:42 +01001798 {
Michalis Spyrou57dac842018-03-01 16:03:50 +00001799 case DataType::QASYMM8:
1800 {
1801 window_x_inc = pool_stride_x;
1802 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
1803 {
1804 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
1805 }
1806 break;
1807 }
Pablo Tello77e6c552018-12-04 15:33:49 +00001808
Georgios Pinitas13d96e02018-08-23 11:20:23 +01001809 case DataType::F16:
Michalis Spyrou57dac842018-03-01 16:03:50 +00001810 case DataType::F32:
1811 {
1812 window_x_inc = pool_stride_x;
1813 break;
1814 }
1815 default:
1816 {
1817 ARM_COMPUTE_ERROR("Not supported");
1818 }
Georgios Pinitas55186712018-01-08 17:37:12 +00001819 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001820 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
1821 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001822 }
Michalis Spyrou57dac842018-03-01 16:03:50 +00001823 else
1824 {
Georgios Pinitascac13b12018-04-27 19:07:19 +01001825 window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
Michalis Spyrou57dac842018-03-01 16:03:50 +00001826 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
1827 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
1828 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001829
1830 // Run function
Pablo Tello77e6c552018-12-04 15:33:49 +00001831 (this->*_func)(window_input, window, _pool_info.pool_type(), exclude_padding);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001832}