blob: c6f7ca4fb2247cf72ba067d231a32da5f8503529 [file] [log] [blame]
Gian Marco58c57942017-11-28 09:10:03 +00001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/NEON/NEAsymm.h"
31#include "arm_compute/core/Types.h"
32#include "arm_compute/core/Utils.h"
33#include "arm_compute/core/Validate.h"
34#include "arm_compute/core/Window.h"
35
36#include <arm_neon.h>
37#include <cstddef>
38#include <cstdint>
39
40using namespace arm_compute;
41
42namespace
43{
44Error validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
45{
46 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
Gian Marco58c57942017-11-28 09:10:03 +000047 ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
48 ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
49
50 // Check biases if exist
51 if(bias != nullptr)
52 {
53 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
54 ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
55 ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
56 }
Chunosov5124be52017-11-22 20:42:13 +070057
58 if(output->total_size() != 0)
59 {
60 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
61 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
62 }
63
Gian Marco58c57942017-11-28 09:10:03 +000064 return Error{};
65}
66
67std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
68{
69 constexpr unsigned int num_elems_processed_per_iteration = 16;
70
71 // Configure kernel window
72 Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
73
74 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
Gian Marco58c57942017-11-28 09:10:03 +000075
76 bool window_changed = update_window_and_padding(win,
Chunosov5124be52017-11-22 20:42:13 +070077 input_access);
78
79 if(output->total_size() != 0)
80 {
81 AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
82 window_changed = window_changed || update_window_and_padding(win, output_result_access);
83
84 output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
85 }
Gian Marco58c57942017-11-28 09:10:03 +000086
87 if(bias != nullptr)
88 {
89 AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
90 window_changed = window_changed || update_window_and_padding(win, bias_access);
91 }
92
Gian Marco58c57942017-11-28 09:10:03 +000093 Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
94 return std::make_pair(err, win);
95}
96
97template <bool is_bounded_relu>
98inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8x16_t min_u8,
99 uint8x16_t max_u8)
100{
101 const static int32x4_t zero_s32 = vdupq_n_s32(0);
102
103 // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
104 in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
105 in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
106 in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
107 in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
108
109 // Round to the nearest division by a power-of-two using result_shift_s32
110 in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
111 in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
112 in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
113 in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
114
115 // Add the offset terms
116 in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
117 in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
118 in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
119 in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
120
121 // Saturate negative values
122 in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
123 in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
124 in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
125 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
126
127 // Convert S32 to S16
128 const int16x8x2_t in_s16 =
129 {
130 {
131 vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
132 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
133 }
134 };
135
136 // Convert S16 to U8
137 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
138
139 if(is_bounded_relu)
140 {
141 out_u8 = vmaxq_u8(out_u8, min_u8);
142 out_u8 = vminq_u8(out_u8, max_u8);
143 }
144
145 return out_u8;
146}
147} // namespace
148
namespace arm_compute
{
// Forward declaration of the coordinate type received by the window-loop lambdas defined below.
class Coordinates;
} // namespace arm_compute
153
154template <bool is_bounded_relu>
155void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window)
156{
157 const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
158 const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min));
159 const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(_max));
160
161 ARM_COMPUTE_UNUSED(min_u8);
162 ARM_COMPUTE_UNUSED(max_u8);
163
164 Iterator in(_input, window);
165 Iterator out(_output, window);
166
167 if(_bias != nullptr)
168 {
169 Window win_biases;
170 win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
171 win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
172
173 Iterator bias(_bias, win_biases);
174 execute_window_loop(window, [&](const Coordinates & id)
175 {
176 int32x4x4_t in_s32 =
177 {
178 {
179 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
180 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
181 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
182 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
183 }
184 };
185
186 const int32x4x4_t bias_s32 =
187 {
188 {
189 vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 0),
190 vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 4),
191 vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 8),
192 vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 12)
193 }
194 };
195
196 // Add the bias to GEMM's result
197 in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
198 in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
199 in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
200 in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
201
202 vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
203 },
204 in, bias, out);
205 }
206 else
207 {
208 execute_window_loop(window, [&](const Coordinates & id)
209 {
210 int32x4x4_t in_s32 =
211 {
212 {
213 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
214 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
215 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
216 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
217 }
218 };
219
220 vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
221 },
222 in, out);
223 }
224}
225
226NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
227 : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0)
228{
229}
230
231void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
232 int result_offset_after_shift, int min, int max)
233{
234 // Perform validate step
235 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
236
237 // Output auto inizialitation if not yet initialized
238 auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
239
240 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
241 (bias != nullptr) ? bias->info() : nullptr,
242 output->info(),
243 min,
244 max));
245
246 _input = input;
247 _bias = bias;
248 _output = output;
249 _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
250 _result_shift = result_shift;
251 _result_offset_after_shift = result_offset_after_shift;
252 _min = min;
253 _max = max;
254
255 // Configure kernel window
256 auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
257 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
258 INEKernel::configure(win_config.second);
259
260 // Check if we need to clamp the result using min and max
261 const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
262 _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<false>;
263}
264
265Error NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
266{
Chunosov5124be52017-11-22 20:42:13 +0700267 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Gian Marco58c57942017-11-28 09:10:03 +0000268 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
269 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
270 (bias != nullptr) ? bias->clone().get() : nullptr,
271 output->clone().get())
272 .first);
273
274 return Error{};
275}
276
277void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window, const ThreadInfo &info)
278{
279 ARM_COMPUTE_UNUSED(info);
280 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
281 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
282
283 (this->*_func)(window);
284}