Blame - src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp - ml/ComputeLibrary

inline uint8_t finalize_quantization(int32x4_t in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8_t min_u8, uint8_t max_u8)
{
    // Requantizes a 32-bit accumulator to an unsigned 8-bit value.
    //
    // All four lanes of in_s32 go through the same arithmetic, but only lane 0 is
    // returned; callers in this file pass a vector with the value of interest
    // broadcast to every lane.
    //
    //   in_s32                        - accumulator(s) to requantize
    //   result_fixedpoint_multiplier  - scalar operand of the saturating rounding doubling multiply-high
    //   result_shift                  - power-of-two exponent of the rounding division
    //   result_offset_after_shift_s32 - offset added after the shift
    //   min_u8 / max_u8               - extra clamp bounds, applied only when is_bounded_relu is true
    //
    // is_bounded_relu is a compile-time flag supplied by callers as a template argument.

    // Plain locals rather than function-local statics: each constant is a single
    // broadcast, whereas a static of vector type needs guarded initialisation
    // that would be checked on every call.
    const int32x4_t zero_s32      = vdupq_n_s32(0);
    const int32x4_t sat_value_s32 = vdupq_n_s32(255);

    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
    in_s32 = vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier);

    // Round to the nearest division by a power-of-two using result_shift
    in_s32 = rounding_divide_by_pow2(in_s32, result_shift);

    // Add the offset terms
    in_s32 = vaddq_s32(in_s32, result_offset_after_shift_s32);

    // Saturate to the representable range [0, 255] before narrowing
    in_s32 = vmaxq_s32(in_s32, zero_s32);
    in_s32 = vminq_s32(in_s32, sat_value_s32);

    // The value is already within [0, 255], so the narrowing cast is lossless
    auto out_u8 = static_cast<uint8_t>(vgetq_lane_s32(in_s32, 0));

    if(is_bounded_relu)
    {
        out_u8 = std::max(out_u8, min_u8);
        out_u8 = std::min(out_u8, max_u8);
    }

    return out_u8;
}

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

135

} // namespace arm_compute

136

137

template <bool is_bounded_relu>

138

void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window)

139

{

140

const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);

141

const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min));

142

const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(_max));

143

144

ARM_COMPUTE_UNUSED(min_u8);

145

ARM_COMPUTE_UNUSED(max_u8);

146

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

147

const int window_step_x = 16;

148

const auto window_start_x = static_cast<int>(window.x().start());

149

const auto window_end_x = static_cast<int>(window.x().end());

150

151

Window win(window);

152

win.set(Window::DimX, Window::Dimension(0, 1, 1));

153

154

Iterator in(_input, win);

155

Iterator out(_output, win);

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

if(_bias != nullptr)

{

Window win_biases;

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

160

win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

161

win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));

162

163

Iterator bias(_bias, win_biases);

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

164

execute_window_loop(win, [&](const Coordinates & id)

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

165

{

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

166

// Compute 16 elements per iteration

167

int x = window_start_x;

168

for(; x <= (window_end_x - window_step_x); x += window_step_x)

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

169

{

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

170

int32x4x4_t in_s32 =

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

171

{

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

172

{

173

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),

174

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),

175

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),

176

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)

177

}

178

};

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

179

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

180

const int32x4x4_t bias_s32 =

181

{

182

{

183

vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),

184

vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),

185

vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),

186

vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)

}

};

// Add the bias to GEMM's result

191

in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);

192

in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);

193

in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);

194

in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);

195

196

vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));

197

}

198

199

// Compute left-over elements

200

for(; x < window_end_x; ++x)

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

201

{

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

202

const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias.ptr()) + x);

203

int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

204

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

205

// Add bias

206

in_value += bias_value;

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

207

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

208

// Finalize and store the result

209

*(out.ptr() + x) = finalize_quantization<is_bounded_relu>(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min),

210

static_cast<uint8_t>(_max));

211

}

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

},

in, bias, out);

}

else

{

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

217

execute_window_loop(win, [&](const Coordinates & id)

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

218

{

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

219

// Compute 16 elements per iteration

220

int x = window_start_x;

221

for(; x <= (window_end_x - window_step_x); x += window_step_x)

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

222

{

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

223

int32x4x4_t in_s32 =

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

224

{

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

225

{

226

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),

227

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),

228

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),

229

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)

230

}

231

};

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

232

Gian Marco

7f0f790

2017-12-07 09:26:56 +0000

[diff] [blame]

233

vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));

234

}

235

236

// Compute left-over elements

237

for(; x < window_end_x; ++x)

238

{

239

const int32x4_t in_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x);

240

241

// Finalize and store the result

242

*(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));

243

}

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

},

in, out);

}

}

/** Default constructor: null function/tensor pointers and zeroed quantization parameters. */
NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
    : _func(nullptr),
      _input(nullptr),
      _bias(nullptr),
      _output(nullptr),
      _result_fixedpoint_multiplier(0),
      _result_shift(0),
      _result_offset_after_shift(0),
      _min(0),
      _max(0)
{
}

void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,

255

int result_offset_after_shift, int min, int max)

256

{

257

// Perform validate step

258

ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

259

260

// Output auto inizialitation if not yet initialized

261

auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));

262

263

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),

264

(bias != nullptr) ? bias->info() : nullptr,

output->info(),

min,

max));

_input = input;

_bias = bias;

_output = output;

_result_fixedpoint_multiplier = result_fixedpoint_multiplier;

273

_result_shift = result_shift;

274

_result_offset_after_shift = result_offset_after_shift;

_min = min;

_max = max;

// Configure kernel window

279

auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());

280

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

281

INEKernel::configure(win_config.second);

282

283

// Check if we need to clamp the result using min and max

284

const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));

285

_func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<false>;

286

}

287

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame]

288

Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)

Gian Marco

58c5794

2017-11-28 09:10:03 +0000

[diff] [blame]

289

{

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

290

ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);