Blame - src/runtime/CL/functions/CLQLSTMLayer.cpp - ml/ComputeLibrary

ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));

44

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));

45

return Status{};

}

} // namespace

// Builds the layer around the supplied memory manager: the manager is moved
// into the MemoryGroup member that configure_mm()/configure() use to manage
// intermediate tensors.
CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager))
{
}

53

54

void CLQLSTMLayer::configure_mm(CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,

55

const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,

56

CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,

57

const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)

58

{

59

_memory_group.manage(mm_res);

60

_memory_group.manage(outstage_res);

61

62

mm_res->allocator()->init(mm_res_info);

63

outstage_res->allocator()->init(outstage_tensor_info);

64

65

// Configure matrix-multiplication

66

mm.configure(mm_input, mm_weights, nullptr, mm_res);

67

68

// Configure output stage

69

quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

70

outstage.configure(mm_res, bias, outstage_res, gemmlowp_info);

71

mm_res->allocator()->allocate();

72

}

73

74

void CLQLSTMLayer::configure(const ICLTensor *input,

75

const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,

76

const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,

77

const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,

78

const ICLTensor *cell_state_in, const ICLTensor *output_state_in,

79

ICLTensor *cell_state_out, ICLTensor *output_state_out,

80

const LSTMParams<ICLTensor> &lstm_params)

81

{

82

ARM_COMPUTE_UNUSED(forget_gate_bias);

83

ARM_COMPUTE_UNUSED(cell_bias);

84

ARM_COMPUTE_UNUSED(output_gate_bias);

85

ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,

86

recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,

87

forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);

88

89

// Set lstm parameters

90

LSTMParams<ITensorInfo> lstm_params_info{};

91

build_lstm_params_tensor_info(lstm_params, &lstm_params_info);

92

93

// Validate

94

ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),

95

recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),

96

forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),

97

cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), lstm_params_info));

98

99

const int batch_size = input->info()->dimension(1);

100

const int num_units = input_to_output_weights->info()->dimension(1);

101

102

const UniformQuantizationInfo qinput = input->info()->quantization_info().uniform();

103

const UniformQuantizationInfo qcell_state_in = cell_state_in->info()->quantization_info().uniform();

104

const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();

105

106

_projection_bias = lstm_params.projection_bias();

107

_input_to_forget_weights = input_to_forget_weights;

108

_input_to_cell_weights = input_to_cell_weights;

109

_input_to_output_weights = input_to_output_weights;

110

_recurrent_to_forget_weights = recurrent_to_forget_weights;

111

_recurrent_to_cell_weights = recurrent_to_cell_weights;

112

_recurrent_to_output_weights = recurrent_to_output_weights;

113

_projection_weights = lstm_params.projection_weights();

114

115

_has_cifg = lstm_params.has_cifg_opt();

116

_has_projection = lstm_params.has_projection();

117

_has_peephole = lstm_params.has_peephole_opt();

118

119

// Calculate and decompose effective scales for optimizing matmul calculation

120

const int32_t cell_shift = log2(qcell_state_in.scale);

121

122

// Calculate quantized parameters for clipping.

123

int16_t quantized_cell_clip = 0;

124

if(lstm_params.cell_clip() > 0.0f)

125

{

126

quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);

127

}

128

_has_cell_clipping = quantized_cell_clip > 0;

129

130

// Precompute effective bias for optimizing the matmul computations.

131

if(!_has_cifg)

132

{

133

_input_to_input_weights = lstm_params.input_to_input_weights();

134

_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();

135

136

_input_to_input_reduction.configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

137

_recurrent_to_input_reduction.configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

138

}

139

_input_to_forget_reduction.configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

140

_recurrent_to_forget_reduction.configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

141

_input_to_cell_reduction.configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

142

_recurrent_to_cell_reduction.configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

143

_input_to_output_reduction.configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

144

_recurrent_to_output_reduction.configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

145

if(_projection_bias != nullptr)

146

{

147

_projection_reduction.configure(_projection_weights, &_projection_reduction_res, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(), true));

148

_projection_bias_add.configure(ArithmeticOperation::ADD, _projection_bias, &_projection_reduction_res, &_projection_eff_bias, ConvertPolicy::SATURATE);

149

}

150

151

// Pre-transpose weights to be used in GEMM.

152

_transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed);

153

_transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed);

154

_transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed);

155

_transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);

156

_transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);

157

_transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);

158

if(!_has_cifg)

159

{

160

_transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);

161

_transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);

}

if(_has_projection)

{

_transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed);

166

}

167

168

GEMMLowpOutputStageInfo gemmlowp_info;

169

gemmlowp_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;

170

gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();

171

gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();

172

gemmlowp_info.output_data_type = DataType::QSYMM16;

173

174

const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);

175

// Forget gate.

176

const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));

177

const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();

178

configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,

179

input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,

180

&_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,

181

mm_out_info, forget_gate_outstage_info);

182

183

const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();

184

configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,

185

output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,

186

&_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,

187

mm_out_info, forget_gate_outstage_info);

188

189

_accumulate_input_recurrent_forget.configure(ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);

190

_input_to_forget_outstage_res.allocator()->allocate();

if(_has_peephole)

{

_memory_group.manage(&_mul_cell_to_forget_res);

195

_pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

196

_cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));

197

_memory_group.manage(&_cell_to_forget_outstage_res);

198

const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();

199

quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

200

_cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);

201

_mul_cell_to_forget_res.allocator()->allocate();

202

_accumulate_cell_forget.configure(ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);

203

_cell_to_forget_outstage_res.allocator()->allocate();

204

}

205

206

// Output quantization info of Sigmoid and Tanh activations

207

const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);

208

209

const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

210

_memory_group.manage(&_forget_gate);

211

_forget_gate.allocator()->init(forget_gate_info);

212

_forget_gate_sigmoid.configure(&_recurrent_to_forget_outstage_res, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));

213

_recurrent_to_forget_outstage_res.allocator()->allocate();

214

215

// Modulation gate.

216

const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));

217

const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();

218

configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,

219

input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,

220

&_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,

221

mm_out_info, cell_outstage_info);

222

223

const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();

224

configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,

225

output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,

226

&_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,

227

mm_out_info, cell_outstage_info);

228

229

_accumulate_input_recurrent_modulation.configure(ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);

230

_input_to_cell_outstage_res.allocator()->allocate();

231

232

const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

233

_memory_group.manage(&_cell_gate);

234

_cell_gate.allocator()->init(cell_gate_info);

235

_cell_gate_tanh.configure(&_recurrent_to_cell_outstage_res, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

236

_recurrent_to_cell_outstage_res.allocator()->allocate();

237

238

// Input gate.

239

const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

240

_input_gate.allocator()->init(input_gate_info);

241

_memory_group.manage(&_input_gate);

242

if(_has_cifg)

243

{

244

_ones.allocator()->init(*_forget_gate.info());

245

_input_gate_sub.configure(ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);

246

_ones.allocator()->allocate();

}

else

{

const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));

251

const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();

252

configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info,

253

input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,

254

&_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,

255

mm_out_info, input_outstage_info);

256

257

const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();

258

configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,

259

input, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,

260

&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,

261

mm_out_info, input_outstage_info);

262

_accumulate_input_recurrent_input.configure(ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);

263

_input_to_input_outstage_res.allocator()->allocate();

if(_has_peephole)

{

_memory_group.manage(&_mul_cell_to_input_res);

268

_pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

269

const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();

270

quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

271

_cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));

272

_memory_group.manage(&_cell_to_input_outstage_res);

273

_cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);

274

_mul_cell_to_input_res.allocator()->allocate();

275

_accumulate_cell_input.configure(ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);

276

_cell_to_input_outstage_res.allocator()->allocate();

277

}

278

279

_input_gate_tanh.configure(&_recurrent_to_input_outstage_res, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

280

_recurrent_to_input_outstage_res.allocator()->allocate();

281

}

282

// Cell.

283

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

284

_pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

285

const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;

286

const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);

287

const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));

288

_memory_group.manage(&_mul_input_cell_res);

289

_mul_input_cell_res.allocator()->init(mul_input_cell_info);

290

_pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

291

_cell_gate.allocator()->allocate();

292

_add_forget_cell.configure(ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);

293

_mul_input_cell_res.allocator()->allocate();

294

_forget_gate.allocator()->allocate();

295

if(_has_cell_clipping)

296

{

297

_cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));

298

}

299

// Output gate.

300

const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));

301

const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();

302

configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info,

303

input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,

304

&_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,

305

mm_out_info, output_outstage_info);

306

307

const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();

308

configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,

309

output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,

310

&_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,

311

mm_out_info, output_outstage_info);

312

313

_accumulate_input_recurrent_output.configure(ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);

314

_input_to_output_outstage_res.allocator()->allocate();

if(_has_peephole)

{

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

319

// Here we are not using the output stage because all operations are done in float

320

// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();

321

// quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

322

_memory_group.manage(&_mul_cell_to_output_res);

323

_pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

324

_accumulate_cell_to_output.configure(ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_mul_cell_to_output_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);

325

_mul_cell_to_output_res.allocator()->allocate();

326

}

327

328

const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

329

_memory_group.manage(&_output_gate);

330

_output_gate.allocator()->init(output_gate_info);

331

_output_gate_sigmoid.configure(&_recurrent_to_output_outstage_res, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));

332

_recurrent_to_output_outstage_res.allocator()->allocate();

333

334

// Hidden.

335

_hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

336

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

337

_memory_group.manage(&_hidden_mul_res);

338

const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);

339

_hidden_mul_res.allocator()->init(hidden_mul_res);

340

_pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

341

_output_gate.allocator()->allocate();

342

_input_gate.allocator()->allocate();

343

const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);

344

quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);

345

gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();

346

gemmlowp_info.output_data_type = output_state_in->info()->data_type();

347

_hidden_outstage.configure(&_hidden_mul_res, nullptr, output_state_out, gemmlowp_info);

348

_hidden_mul_res.allocator()->allocate();

// Projection.

if(_has_projection)

{

const TensorInfo projection_outstage_info(*output_state_out->info());

354

const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();

355

const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;

356

gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;

357

gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();

358

gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();

359

gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;

360

361

configure_mm(_mm_projection, _projection_outstage, gemmlowp_info,

362

output_state_out, &_projection_weights_transposed, &_projection_eff_bias,

363

&_mm_projection_res, &_projection_outstage_res, projection_scale,

364

mm_out_info, projection_outstage_info);

365

366

_accumulate_projection.configure(ArithmeticOperation::ADD, &_projection_outstage_res, output_state_out, output_state_out, ConvertPolicy::SATURATE);

367

_projection_outstage_res.allocator()->allocate();

368

369

int8_t quantized_projection_clip{ 0 };

370

if(lstm_params.projection_clip() > 0.0f)

371

{

372

quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);

373

}

374

375

if(quantized_projection_clip > 0)

376

{

377

_projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip));

378

_has_projection_clipping = true;

}

}

}

Status CLQLSTMLayer::validate(const ITensorInfo *input,

384

const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,

385

const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,

386

const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,

387

const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,

388

const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out,

389

const LSTMParams<ITensorInfo> &lstm_params)

390

{

391

ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,

392

recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);

393

394

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);

395

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");

396

397

const unsigned int input_size = input->dimension(0);

398

const unsigned int batch_size = input->dimension(1);

399

const unsigned int num_units = input_to_output_weights->dimension(1);

400

const unsigned int output_size = recurrent_to_output_weights->dimension(0);

401

402

ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);

403

ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);

404

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);

405

ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);

406

ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);

407

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);

408

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);

409

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,

410

recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);

411

412

ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);

413

ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);

414

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);

415

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(forget_gate_bias, 1, DataType::S32);

416

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, cell_bias, output_gate_bias);

417

418

ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() != 2);

419

ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(0) != num_units);

420

ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(1) != batch_size);

421

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(cell_state_in, 1, DataType::QSYMM16);

422

423

ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() != 2);

424

ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(0) != output_size);

425

ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(1) != batch_size);

426

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);

427

428

// Check whether peephole weights are all there or none

429

if(lstm_params.has_peephole_opt())

430

{

431

ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());

432

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);

433

ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);

434

ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);

435

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());

436

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());

437

438

if(!lstm_params.has_cifg_opt())

439

{

440

ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());

441

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());

442

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());

}

}

const UniformQuantizationInfo qinput = input->quantization_info().uniform();

447

const UniformQuantizationInfo qcell_state_in = cell_state_in->quantization_info().uniform();

448

const UniformQuantizationInfo qoutput_state_in = output_state_in->quantization_info().uniform();

449

450

// Calculate and decompose effective scales for optimizing matmul calculation

451

const int32_t cell_shift = log2(qcell_state_in.scale);

452

ARM_COMPUTE_RETURN_ERROR_ON(cell_shift > -9);

453

454

// Calculate quantized parameters for clipping.

455

int16_t quantized_cell_clip = 0;

456

if(lstm_params.cell_clip() > 0.0f)

457

{

458

quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);

459

}

460

461

// Precompute effective bias for optimizing the matmul computations.

462

const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);

463

if(!lstm_params.has_cifg_opt())

464

{

465

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));

466

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,

467

true)));

468

}

469

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));

470

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));

471

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));

472

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));

473

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));

474

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));

475

if(lstm_params.projection_bias() != nullptr)

476

{

477

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(),

478

true)));

479

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, lstm_params.projection_bias(), &eff_bias_info, &eff_bias_info, ConvertPolicy::SATURATE));

480

}

481

482

const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());

483

const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());

484

485

// Validate weights transpose

486

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));

487

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_cell_weights, &input_weights_transposed));

488

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_output_weights, &input_weights_transposed));

489

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));

490

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));

491

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));

492

if(!lstm_params.has_cifg_opt())

493

{

494

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));

495

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));

496

}

497

if(lstm_params.has_projection())

498

{

499

ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &recurrent_weights_transposed));

500

}

501

502

GEMMLowpOutputStageInfo gemmlowp_info;

503

gemmlowp_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;

504

gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();

505

gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();

506

gemmlowp_info.output_data_type = DataType::QSYMM16;

507

508

// Forget gate.

509

const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));

510

const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);

511

const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();

512

validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info);

513

514

const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();

515

validate_mm(gemmlowp_info, input, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info);

516

517

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));

518

519

if(lstm_params.has_peephole_opt())

520

{

521

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);

522

ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,

523

RoundingPolicy::TO_ZERO));

524

const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();

525

ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));

526

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));

527

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));

528

}

529

530

// Output quantization info of Sigmoid and Tanh activations

531

const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);

532

533

const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

534

ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));

535

536

// Modulation gate.

537

const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));

538

const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();

539

validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info);

540

541

const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();

542

validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info);

543

544

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));

545

546

const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

547

ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));

548

549

// Input gate.

550

const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

551

if(lstm_params.has_cifg_opt())

552

{

553

ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");

554

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));

}

else

{

ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());

559

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());

560

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());

561

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());

562

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());

563

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());

564

565

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(input, lstm_params.input_to_input_weights(), nullptr, &mm_out_info));

566

const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));

567

const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();

568

validate_mm(gemmlowp_info, input, lstm_params.input_to_input_weights(), &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info);

569

570

const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();

571

validate_mm(gemmlowp_info, input, lstm_params.recurrent_to_input_weights(), &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info);

572

573

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));

574

575

if(lstm_params.has_peephole_opt())

576

{

577

ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_outstage_info, 1.f, ConvertPolicy::SATURATE,

578

RoundingPolicy::TO_ZERO));

579

const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();

580

ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));

581

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&input_outstage_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));

582

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));

583

}

584

585

ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));

586

}

587

// Cell.

588

ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));

589

ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));

590

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));

591

if(quantized_cell_clip > 0)

592

{

593

ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,

594

quantized_cell_clip)));

595

}

596

// Output gate.

597

const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));

598

const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();

599

validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info);

600

601

const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();

602

validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info);

603

604

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));

605

if(lstm_params.has_peephole_opt())

606

{

607

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);

608

// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel

609

// Here we are not using the output stage because all operations are done in float

610

// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();

611

// ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));

612

ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,

613

RoundingPolicy::TO_ZERO));

614

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));

615

}

616

617

const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

618

ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));

619

620

// Hidden.

621

ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));

622

const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);

623

ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));

624

const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);

625

ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));

626

gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();

627

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, output_state_out, gemmlowp_info));

628

629

// Projection.

630

if(lstm_params.has_projection())

631

{

632

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());

633

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias());

634

635

const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();

636

const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;

637

ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));

638

gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;

639

gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();

640

gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();

641

gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;

642

643

const TensorInfo projection_outstage_info(*output_state_out);

644

validate_mm(gemmlowp_info, output_state_out, &recurrent_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &projection_outstage_info);

645

646

ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));

647

648

int8_t quantized_projection_clip{ 0 };

649

if(lstm_params.projection_clip() > 0.0f)

650

{

651

quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);

652

}

653

654

if(quantized_projection_clip > 0)

655

{

656

ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,

657

quantized_projection_clip)));

}

}

if(cell_state_out->total_size() > 0)

662

{

663

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);

664

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);

665

}

666

667

if(output_state_out->total_size() > 0)

668

{

669

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);

670

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);

}

return Status{};

}

void CLQLSTMLayer::run()

{

prepare();

// Acquire all the temporaries

681

MemoryGroupResourceScope scope_mg(_memory_group);

682

683

// Forget gate.

684

_mm_input_to_forget.run();

685

_input_to_forget_outstage.run();

686

687

_mm_recurrent_to_forget.run();

688

_recurrent_to_forget_outstage.run();

689

CLScheduler::get().enqueue(_accumulate_input_recurrent_forget);

if(_has_peephole)

{

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_forget);

694

_cell_to_forget_outstage.run();

695

CLScheduler::get().enqueue(_accumulate_cell_forget);

696

}

697

698

_forget_gate_sigmoid.run();

699

700

// Modulation gate.

701

_mm_input_to_cell.run();

702

_input_to_cell_outstage.run();

703

704

_mm_recurrent_to_cell.run();

705

_recurrent_to_cell_outstage.run();

706

CLScheduler::get().enqueue(_accumulate_input_recurrent_modulation);

707

708

_cell_gate_tanh.run();

// Input gate

if(_has_cifg)

{

CLScheduler::get().enqueue(_input_gate_sub);

}

else

{

_mm_input_to_input.run();

718

_input_to_input_outstage.run();

719

_mm_recurrent_to_input.run();

720

_recurrent_to_input_outstage.run();

721

CLScheduler::get().enqueue(_accumulate_input_recurrent_input);

if(_has_peephole)

{

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_input);

726

_cell_to_input_outstage.run();

727

CLScheduler::get().enqueue(_accumulate_cell_input);

728

}

729

730

_input_gate_tanh.run();

}

// Cell.

CLScheduler::get().enqueue(_pixelwise_mul_forget_cell);

735

CLScheduler::get().enqueue(_pixelwise_mul_input_cell);

736

CLScheduler::get().enqueue(_add_forget_cell);

737

if(_has_cell_clipping)

{

_cell_clip.run();

}

// Output gate.

_mm_input_to_output.run();

744

_input_to_output_outstage.run();

745

_mm_recurrent_to_output.run();

746

_recurrent_to_output_outstage.run();

747

CLScheduler::get().enqueue(_accumulate_input_recurrent_output);

748

if(_has_peephole)

749

{

750

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_output);

751

CLScheduler::get().enqueue(_accumulate_cell_to_output);

752

}

753

754

_output_gate_sigmoid.run();

// Hidden.

_hidden_tanh.run();

CLScheduler::get().enqueue(_pixelwise_mul_hidden);

759

_hidden_outstage.run();

// Projection.

if(_has_projection)

{

_mm_projection.run();

765

_projection_outstage.run();

766

CLScheduler::get().enqueue(_accumulate_projection);

767

if(_has_projection_clipping)

768

{

769

_projection_clip.run();

}

}

}

void CLQLSTMLayer::prepare()

{

if(!_is_prepared)

{

// Pre-transpose weights to be used in GEMM.

779

_input_to_forget_weights_transposed.allocator()->allocate();

780

_input_to_cell_weights_transposed.allocator()->allocate();

781

_input_to_output_weights_transposed.allocator()->allocate();

782

_recurrent_to_forget_weights_transposed.allocator()->allocate();

783

_recurrent_to_cell_weights_transposed.allocator()->allocate();

784

_recurrent_to_output_weights_transposed.allocator()->allocate();

785

_transpose_input_to_forget_weights.run();

786

_transpose_input_to_cell_weights.run();

787

_transpose_input_to_output_weights.run();

788

_transpose_recurrent_to_forget_weights.run();

789

_transpose_recurrent_to_cell_weights.run();

790

_transpose_recurrent_to_output_weights.run();

791

792

// Precompute effective biases

if(_has_cifg)

{

_ones.map(true);

std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);

_ones.unmap();

}

else

{

_input_to_input_eff_bias.allocator()->allocate();

802

_recurrent_to_input_eff_bias.allocator()->allocate();

803

CLScheduler::get().enqueue(_input_to_input_reduction);

804

CLScheduler::get().enqueue(_recurrent_to_input_reduction);

805

806

_input_to_input_weights_transposed.allocator()->allocate();

807

_recurrent_to_input_weights_transposed.allocator()->allocate();

808

_transpose_input_to_input_weights.run();

809

_transpose_recurrent_to_input_weights.run();

810

_input_to_input_weights->mark_as_unused();

811

_recurrent_to_input_weights->mark_as_unused();

812

}

813

_input_to_forget_eff_bias.allocator()->allocate();

814

_recurrent_to_forget_eff_bias.allocator()->allocate();

815

_input_to_cell_eff_bias.allocator()->allocate();

816

_recurrent_to_cell_eff_bias.allocator()->allocate();

817

_input_to_output_eff_bias.allocator()->allocate();

818

_recurrent_to_output_eff_bias.allocator()->allocate();

819

CLScheduler::get().enqueue(_input_to_forget_reduction);

820

CLScheduler::get().enqueue(_recurrent_to_forget_reduction);

821

CLScheduler::get().enqueue(_input_to_cell_reduction);

822

CLScheduler::get().enqueue(_recurrent_to_cell_reduction);

823

CLScheduler::get().enqueue(_input_to_output_reduction);

824

CLScheduler::get().enqueue(_recurrent_to_output_reduction);

if(_has_projection)

{

if(_projection_bias != nullptr)

829

{

830

_projection_eff_bias.allocator()->allocate();

831

CLScheduler::get().enqueue(_projection_reduction);

832

_projection_bias->mark_as_unused();

833

}

834

835

_projection_weights_transposed.allocator()->allocate();

836

_transpose_projection_weights.run();

837

_projection_weights->mark_as_unused();

838

}

839

840

// Mark weights as unused

841

_input_to_forget_weights->mark_as_unused();

842

_input_to_cell_weights->mark_as_unused();

843

_input_to_output_weights->mark_as_unused();

844

_recurrent_to_forget_weights->mark_as_unused();

845

_recurrent_to_cell_weights->mark_as_unused();

846

_recurrent_to_output_weights->mark_as_unused();

847

848

CLScheduler::get().queue().finish();

_is_prepared = true;

}

}

} // namespace arm_compute