Blame - src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp - ml/ComputeLibrary

2017-11-28 09:10:03 +0000

[diff] [blame]

137

} // namespace arm_compute

138

139

template <bool is_bounded_relu>

140

void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window)

141

{

142

const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);

143

const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min));

144

const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(_max));

145

146

ARM_COMPUTE_UNUSED(min_u8);

147

ARM_COMPUTE_UNUSED(max_u8);

148

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

149

const int window_step_x = 16;

150

const auto window_start_x = static_cast<int>(window.x().start());

151

const auto window_end_x = static_cast<int>(window.x().end());

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

152

Georgios Pinitas

2018-09-18 18:38:37 +0100

[diff] [blame]

153

Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);

154

win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

155

Georgios Pinitas

2018-09-18 18:38:37 +0100

[diff] [blame]

156

Iterator in(_input, win_collapsed);

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

157

Iterator out(_output, win_collapsed);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

158

if(_bias != nullptr)

159

{

160

Window win_biases;

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

161

win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

162

win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));

163

164

Iterator bias(_bias, win_biases);

Georgios Pinitas

2018-09-18 18:38:37 +0100

[diff] [blame]

165

execute_window_loop(win_collapsed, [&](const Coordinates & id)

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

166

{

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

167

// Compute 16 elements per iteration

168

int x = window_start_x;

169

for(; x <= (window_end_x - window_step_x); x += window_step_x)

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

170

{

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

171

int32x4x4_t in_s32 =

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

172

{

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

173

{

174

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),

175

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),

176

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),

177

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)

178

}

179

};

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

180

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

181

const int32x4x4_t bias_s32 =

182

{

183

{

184

vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),

185

vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),

186

vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),

187

vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)

}

};

// Add the bias to GEMM's result

192

in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);

193

in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);

194

in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);

195

in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);

196

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

197

vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

198

}

199

200

// Compute left-over elements

201

for(; x < window_end_x; ++x)

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

202

{

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

203

const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias.ptr()) + x);

204

int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

205

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

206

// Add bias

207

in_value += bias_value;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

208

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

209

// Finalize and store the result

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

210

*(out.ptr() + x) = finalize_quantization<is_bounded_relu>(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min),

211

static_cast<uint8_t>(_max));

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

212

}

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

213

},

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

214

in, out, bias);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

215

}

216

else

217

{

Georgios Pinitas

2018-09-18 18:38:37 +0100

[diff] [blame]

218

execute_window_loop(win_collapsed, [&](const Coordinates & id)

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

219

{

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

220

// Compute 16 elements per iteration

221

int x = window_start_x;

222

for(; x <= (window_end_x - window_step_x); x += window_step_x)

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

223

{

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

224

int32x4x4_t in_s32 =

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

225

{

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

226

{

227

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),

228

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),

229

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),

230

vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)

231

}

232

};

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

233

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

234

vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

235

}

236

237

// Compute left-over elements

238

for(; x < window_end_x; ++x)

239

{

240

const int32x4_t in_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x);

241

242

// Finalize and store the result

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

243

*(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));

Gian Marco

2017-12-07 09:26:56 +0000

[diff] [blame]

244

}

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

245

},

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

246

in, out);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

}

}

/** Default constructor: no run function or tensors bound, all quantization parameters zeroed.
 *
 * configure() must be called to bind the tensors and select the run function.
 */
NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
    : _func(nullptr),
      _input(nullptr),
      _bias(nullptr),
      _output(nullptr),
      _result_fixedpoint_multiplier(0),
      _result_shift(0),
      _result_offset_after_shift(0),
      _min(0),
      _max(0)
{
}

void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

256

int result_offset_after_shift, int min, int max)

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

257

{

258

// Perform validate step

259

ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

260

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

_input = input;

_bias = bias;

_output = output;

_result_fixedpoint_multiplier = result_fixedpoint_multiplier;

266

_result_shift = result_shift;

267

_result_offset_after_shift = result_offset_after_shift;

_min = min;

_max = max;

// Configure kernel window

272

auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());

273

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

274

INEKernel::configure(win_config.second);

275

276

// Check if we need to clamp the result using min and max

277

const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));

278

_func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<false>;

279

}

280

Georgios Pinitas

2018-11-08 10:22:01 +0000

[diff] [blame]

281

Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)

Gian Marco