/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
#define SRC_CORE_NEON_KERNELS_QUANTIZED_H

#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>

namespace arm_compute
{
namespace cpu
{
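/** Quantize a single float value to the 8-bit asymmetric representation selected by T (uint8_t or int8_t).
 *
 * @param[in] val  Value to quantize.
 * @param[in] info Quantization info (scale and offset) of the output.
 *
 * @return Quantized value.
 */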
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val, const UniformQuantizationInfo &info)
{
    return quantize_qasymm8_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val, const UniformQuantizationInfo &info)
{
    return quantize_qasymm8(val, info);
}

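/** Convert four float32 lanes to the 32-bit integer vector type T (uint32x4_t or int32x4_t).
 *
 * Rounding follows the underlying vcvtq intrinsics, i.e. truncation towards zero.
 */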
template <typename T>
inline T vcvtq_q32_f32(float32x4_t values);

template <>
inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
{
    return vcvtq_u32_f32(values);
}

template <>
inline int32x4_t vcvtq_q32_f32(float32x4_t values)
{
    return vcvtq_s32_f32(values);
}

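/** Convert a 32-bit integer vector T (uint32x4_t or int32x4_t) to four float32 lanes. */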
template <typename T>
inline float32x4_t vcvtq_f32_q32(T values);

template <>
inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
{
    return vcvtq_f32_u32(values);
}

template <>
inline float32x4_t vcvtq_f32_q32(int32x4_t values)
{
    return vcvtq_f32_s32(values);
}

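/** Requantize 16 pooling accumulators, folding the average-pooling scale into the requantization scale.
 *
 * @param[in] acc           Accumulators held as four float32x4_t vectors.
 * @param[in] quant_rescale Ratio of destination to source quantization scale.
 * @param[in] scale_pooling Reciprocal of the pooling window area.
 * @param[in] new_offset    Destination offset already corrected for the source offset.
 *
 * @return 16 requantized 8-bit values.
 */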
template <typename Tout>
inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);

template <>
inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
{
    const float new_scale = quant_rescale / scale_pooling;
    return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
}

template <>
inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
{
    const float new_scale = quant_rescale / scale_pooling;
    return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
}

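/** Requantize two vectors of eight 8-bit values into one vector of sixteen using the given quantization info. */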
template <typename Tin, typename Tout>
inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);

template <>
inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x4_t acc =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
        }
    };
    return vquantize(acc, requant_qinfo);
}

template <>
inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x4_t acc =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
        }
    };
    return vquantize_signed(acc, requant_qinfo);
}

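/** Requantize a single vector of eight 8-bit values using the given quantization info. */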
template <typename T>
inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);

template <>
inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x2_t acc =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
        }
    };
    return vquantize(acc, requant_qinfo);
}

template <>
inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x2_t acc =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
        }
    };
    return vquantize_signed(acc, requant_qinfo);
}

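/** Compute the reciprocal of the pooling window area for the output element at @p id.
 *
 * When @p exclude_padding is true, elements in the left/top padding are not counted;
 * the window end is always clamped to @p upper_bound_w / @p upper_bound_h.
 */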
inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    int start_x = id[idx_width] * stride_x - pad_x;
    int start_y = id[idx_height] * stride_y - pad_y;

    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}

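/** Quantized MxN pooling (average or max) for NHWC tensors.
 *
 * Channels are processed along X: 16 at a time, then (for max pooling) 8 at a time, with a scalar left-over loop.
 */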
template <typename T>
void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);

    const int window_start_x     = window.x().start();
    const int window_end_x       = window.x().end();
    const int window_step_x      = 16;
    const int window_half_step_x = window_step_x / 2;

    Window window_out = window;
    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator in(src, window_src);
    Iterator out(dst0, window_out);

    using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
    using q16_t   = typename wrapper::traits::promote_t<T>;
    using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q32_t   = typename wrapper::traits::promote_t<q16_t>;
    using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;

    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();

    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const float32x4_t             half_scale_v = vdupq_n_f32(0.5f);
    const UniformQuantizationInfo src_qinfo    = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo    = dst0->info()->quantization_info().uniform();

    const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
    // "new_offset" doesn't need to account for "half_scale_v": since the requantization
    // is performed in a single step, no extra rounding uncertainty is introduced.
    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    execute_window_loop(window_out, [&](const Coordinates & id)
    {
        const int idx_width    = id.y() * pool_stride_x;
        const int idx_height   = id.z() * pool_stride_y;
        const int pool_limit_y = pool_pad_top - idx_height;
        const int pool_limit_x = pool_pad_left - idx_width;

        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
        const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
        const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);

        int x_off = window_start_x;
        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
        {
            if(pool_info.pool_type != PoolingType::MAX)
            {
                q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
                q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
                q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
                q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});

                // Calculate scale
                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                                        pool_stride_y);

                // Perform pooling
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
                                                                                         (src->info()->strides_in_bytes().z())) + x_off);

                        const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
                        const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
                        vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
                        vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
                        vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
                        vres4                   = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float32x4x4_t vres =
                    {
                        {
                            vcvtq_f32_q32(vres1),
                            vcvtq_f32_q32(vres2),
                            vcvtq_f32_q32(vres3),
                            vcvtq_f32_q32(vres4),
                        }
                    };
                    const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
                    // Store result
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
                }
                else
                {
                    const float32x4_t scale_v = vdupq_n_f32(scale);
                    // Divide by the pooling window area (multiply by scale) and add 0.5f to round to nearest instead of rounding towards zero
                    vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
                    vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
                    vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
                    vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));

                    const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
                    const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
                    // Store result
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
                }
            }
            else
            {
                q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
                                                                                         (src->info()->strides_in_bytes().z())) + x_off);
                        vres = wrapper::vmax(vres, data);
                    }
                }

                // Store result
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
                                                                                                                                          requant_qinfo) :
                                vres);
            }
        }

        if(pool_info.pool_type == PoolingType::MAX)
        {
            for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
            {
                q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
                                                                                       (src->info()->strides_in_bytes().z())) + x_off);
                        vres = wrapper::vmax(vres, data);
                    }
                }

                // Store result
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
            }
        }

        // Left-overs loop
        for(; x_off < window_end_x; ++x_off)
        {
            if(pool_info.pool_type != PoolingType::MAX)
            {
                q32_t res = static_cast<q32_t>(0.f);

                // Calculate scale
                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                                        pool_stride_y);

                // Perform pooling
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
                                                                     (src->info()->strides_in_bytes().z())) + x_off);
                        res += data;
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float res_f           = static_cast<float>(res);
                    const float new_scale       = quant_rescale / scale;
                    const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));

                    // Store result
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
                }
                else
                {
                    // Divide by the pooling window area (multiply by scale) and add 0.5f to round to nearest instead of rounding towards zero
                    res = static_cast<T>(0.5f + static_cast<float>(res) * scale);

                    // Store result
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                }
            }
            else
            {
                T res = std::numeric_limits<T>::min();

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
                                                                     (src->info()->strides_in_bytes().z())) + x_off);
                        res = std::max(res, data);
                    }
                }

                // Store result
                if(src_qinfo != dst_qinfo)
                {
                    const float res_f                           = static_cast<float>(res);
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
                }
                else
                {
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                }
            }
        }

    },
    in, out);
}

#if defined(ENABLE_NCHW_KERNELS)
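/** Scale each of the eight 16-bit lanes of @p v in place by the reciprocal of its pooling window area.
 *
 * @p id_offset and @p step map the eight lanes onto consecutive output columns.
 */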
template <typename T, typename TVec>
inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int       start_x = (id.x() + id_offset) * stride_x - pad_x;
    int       start_y = id.y() * stride_y - pad_y;
    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
    if(exclude_padding)
    {
        start_y = std::max(0, start_y);
    }

    std::array<T, 8> elems =
    {
        {
            wrapper::vgetlane(v, 0),
            wrapper::vgetlane(v, 1),
            wrapper::vgetlane(v, 2),
            wrapper::vgetlane(v, 3),
            wrapper::vgetlane(v, 4),
            wrapper::vgetlane(v, 5),
            wrapper::vgetlane(v, 6),
            wrapper::vgetlane(v, 7),
        }
    };

    for(auto &el : elems)
    {
        int       c_start_x = start_x;
        const int end_x     = std::min(c_start_x + pool_size, upper_bound_w);
        if(exclude_padding)
        {
            c_start_x = std::max(0, c_start_x);
        }
        float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
        el *= scale;
        start_x += step * stride_x;
    }

    v = wrapper::vsetlane(elems[0], v, 0);
    v = wrapper::vsetlane(elems[1], v, 1);
    v = wrapper::vsetlane(elems[2], v, 2);
    v = wrapper::vsetlane(elems[3], v, 3);
    v = wrapper::vsetlane(elems[4], v, 4);
    v = wrapper::vsetlane(elems[5], v, 5);
    v = wrapper::vsetlane(elems[6], v, 6);
    v = wrapper::vsetlane(elems[7], v, 7);
}

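/** Quantized 2x2 pooling for NCHW tensors.
 *
 * When pool_stride_x == 1 a second, shifted set of results is computed so that sixteen outputs are stored per iteration.
 */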
template <typename T>
void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
    using q8x8x2_t  = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
    using q16_t     = typename wrapper::traits::promote_t<T>;
    using q16x4_t   = typename wrapper::traits::neon_vector<q16_t, 4>::type;
    using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size       = 2;
    int           pool_stride_x   = 0;
    int           pool_stride_y   = 0;
    const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const T *const src_top_ptr    = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));

    const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;

    const UniformQuantizationInfo src_qinfo            = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo            = dst0->info()->quantization_info().uniform();
    const bool                    have_different_qinfo = src_qinfo != dst_qinfo;

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto top_data    = wrapper::vloadq(src_top_ptr + in.offset());
        const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset());
        q8x8_t     lower_res   = {};
        q8x8_t     upper_res   = {};

        if(pool_info.pool_type != PoolingType::MAX)
        {
            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Add rows
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
                    wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
                }
            };

            // Pair-wise add row data
            const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
            const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));

            q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);

            // Scale lower result
            scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
                                               pool_size, upper_bound_w, upper_bound_h,
                                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            lower_res = wrapper::vmovn(res_lower);

            // Compute upper result for stride_x == 1
            if(pool_stride_x == 1)
            {
                // Shifted row sum
                const q16x8x2_t vrsum_shifted =
                {
                    {
                        wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                        wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                    }
                };

                // Pair-wise add shifted row
                q16x8_t res_upper = wrapper::vcombine(
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));

                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                upper_res = wrapper::vmovn(res_upper);
            }
        }
        else
        {
            const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
            lower_res              = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
            if(pool_stride_x == 1)
            {
                const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
                upper_res                      = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
            }
        }

        if(have_different_qinfo)
        {
            const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
            lower_res                  = wrapper::vgetlow(requantized_dst);
            upper_res                  = wrapper::vgethigh(requantized_dst);
        }

        // Store result
        if(pool_stride_x == 1)
        {
            const q8x8x2_t res = { { lower_res, upper_res } };
            wrapper::vstore(reinterpret_cast<T *>(out.ptr()), res);
        }
        else
        {
            wrapper::vstore(reinterpret_cast<T *>(out.ptr()), lower_res);
        }
    },
    in, out);
}

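/** Quantized 3x3 pooling for NCHW tensors.
 *
 * Row sums (or row maxima) are combined with their one- and two-element shifts to form the 3-wide horizontal reduction.
 */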
template <typename T>
void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
    using q8x8x2_t  = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
    using q16_t     = typename wrapper::traits::promote_t<T>;
    using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size       = 3;
    const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int           pool_stride_x   = 0;
    int           pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    const T *const src_top_ptr    = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto top_data    = wrapper::vloadq(src_top_ptr + in.offset());
        const auto middle_data = wrapper::vloadq(src_middle_ptr + in.offset());
        const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset());
        q8x8_t     fres        = {};
        q8x16_t    fqres       = {};

        if(pool_info.pool_type == PoolingType::AVG)
        {
            // Widen data to 16 bits
            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Calculate row sums
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
                    wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
                }
            };
            const q16x8x2_t vrsum_shifted_1 =
            {
                {
                    wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                    wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                }
            };
            const q16x8x2_t vrsum_shifted_2 =
            {
                {
                    wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
                    wrapper::vext_2(vrsum.val[1], vrsum.val[1])
                }
            };
            // Calculate final sum
            q16x8x2_t final_sum =
            {
                {
                    wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
                    wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
                }
            };
            if(pool_stride_x == 2)
            {
                q16x8_t res =
                {
                    wrapper::vgetlane(final_sum.val[0], 0),
                    wrapper::vgetlane(final_sum.val[0], 2),
                    wrapper::vgetlane(final_sum.val[0], 4),
                    wrapper::vgetlane(final_sum.val[0], 6),
                    wrapper::vgetlane(final_sum.val[1], 0),
                    wrapper::vgetlane(final_sum.val[1], 2),
                    wrapper::vgetlane(final_sum.val[1], 4),
                    wrapper::vgetlane(final_sum.val[1], 6),
                };

                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fres = wrapper::vmovn(res);
            }
            else
            {
                // Scale lower result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
            }
        }
        else
        {
            const q8x16_t max_data        = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
            const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
            const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
            const q8x16_t final_max       = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);

            if(pool_stride_x == 2)
            {
                const q8x8x2_t      table      = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
                static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
                fres                           = wrapper::vtbl(table, lookup_val);
            }
            else
            {
                fqres = final_max;
            }
        }

        // Store result
        if(pool_stride_x == 1)
        {
            if(src_qinfo != dst_qinfo)
            {
                fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
            }
            wrapper::vstore(reinterpret_cast<T *>(out.ptr()), fqres);
        }
        else
        {
            if(src_qinfo != dst_qinfo)
            {
                fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
            }
            wrapper::vstore(reinterpret_cast<T *>(out.ptr()), fres);
        }
    },
    in, out);
}

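/** Quantized MxN pooling (average or max) for NCHW tensors, producing one output element per iteration.
 *
 * The pooling window is traversed eight elements at a time with a scalar left-over loop.
 */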
template <typename T>
void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
    using q16_t   = typename wrapper::traits::promote_t<T>;
    using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q32_t   = typename wrapper::traits::promote_t<q16_t>;
    using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;

    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int       pool_stride_x   = 0;
    int       pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();

    execute_window_loop(window, [&](const Coordinates & id)
    {
        T res = std::numeric_limits<T>::min();

        if(pool_info.pool_type != PoolingType::MAX)
        {
            q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
            q32_t   sres = 0;

            // Calculate scale
            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
                                                    pool_stride_y);

            // Perform pooling
            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 8); x += 8)
                {
                    const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
                                                                                   (src->info()->strides_in_bytes().y())));

                    const q16x8_t data_q16 = wrapper::vmovl(data);
                    vres                   = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
                }

                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
                                                           (src->info()->strides_in_bytes().y())));
                    sres += data;
                }
            }

            // Reduction
            const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
            sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);

            // Divide by the pooling window area (multiply by scale)
            res = static_cast<T>(support::cpp11::round(sres * scale));
        }
        else
        {
            q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});

            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 8); x += 8)
                {
                    const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
                                                                                   (src->info()->strides_in_bytes().y())));
                    vres = wrapper::vmax(vres, data);
                }
                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
                                                                 (src->info()->strides_in_bytes().y())));
                    res = std::max(res, data);
                }
            }

            // Reduce max
            vres = wrapper::vpmax(vres, vres);
            vres = wrapper::vpmax(vres, vres);
            vres = wrapper::vpmax(vres, vres);

            // Get max value
            res = std::max(res, wrapper::vgetlane(vres, 0));
        }
        // Store result
        res                                 = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
        *(reinterpret_cast<T *>(out.ptr())) = res;
    },
    in, out);
}
#endif /* defined(ENABLE_NCHW_KERNELS) */
} // namespace cpu
} // namespace arm_compute

#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H