Blame - src/runtime/CL/functions/CLQLSTMLayer.cpp - ml/ComputeLibrary

2020-04-08 10:15:51 +0100

[diff] [blame]

54

void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

55

const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,

56

CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,

57

const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)

58

{

59

_memory_group.manage(mm_res);

60

_memory_group.manage(outstage_res);

61

62

mm_res->allocator()->init(mm_res_info);

63

outstage_res->allocator()->init(outstage_tensor_info);

64

65

// Configure matrix-multiplication

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

66

mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

67

68

// Configure output stage

69

quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

70

outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

71

mm_res->allocator()->allocate();

72

}

73

74

void CLQLSTMLayer::configure(const ICLTensor *input,

75

const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,

76

const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,

77

const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,

78

const ICLTensor *cell_state_in, const ICLTensor *output_state_in,

Michele Di Giorgio

2020-05-11 16:17:51 +0100

[diff] [blame^]

79

ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

80

const LSTMParams<ICLTensor> &lstm_params)

81

{

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

82

configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,

83

recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,

Michele Di Giorgio

2020-05-11 16:17:51 +0100

[diff] [blame^]

84

cell_state_in, output_state_in, cell_state_out, output, output_state_out, lstm_params);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

85

}

86

87

void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,

88

const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,

89

const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,

90

const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,

91

const ICLTensor *cell_state_in, const ICLTensor *output_state_in,

Michele Di Giorgio

2020-05-11 16:17:51 +0100

[diff] [blame^]

92

ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

93

const LSTMParams<ICLTensor> &lstm_params)

94

{

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

95

ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,

96

recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,

Michele Di Giorgio

2020-05-11 16:17:51 +0100

[diff] [blame^]

97

forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,

98

cell_state_out, output_state_out, output);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

99

100

// Set lstm parameters

101

LSTMParams<ITensorInfo> lstm_params_info{};

102

build_lstm_params_tensor_info(lstm_params, &lstm_params_info);

103

104

// Validate

105

ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),

106

recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),

107

forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),

Michele Di Giorgio

2020-05-11 16:17:51 +0100

[diff] [blame^]

108

cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),

109

lstm_params_info));

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

110

111

const int batch_size = input->info()->dimension(1);

112

const int num_units = input_to_output_weights->info()->dimension(1);

113

114

const UniformQuantizationInfo qinput = input->info()->quantization_info().uniform();

115

const UniformQuantizationInfo qcell_state_in = cell_state_in->info()->quantization_info().uniform();

116

const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();

117

118

_projection_bias = lstm_params.projection_bias();

119

_input_to_forget_weights = input_to_forget_weights;

120

_input_to_cell_weights = input_to_cell_weights;

121

_input_to_output_weights = input_to_output_weights;

122

_recurrent_to_forget_weights = recurrent_to_forget_weights;

123

_recurrent_to_cell_weights = recurrent_to_cell_weights;

124

_recurrent_to_output_weights = recurrent_to_output_weights;

125

_projection_weights = lstm_params.projection_weights();

126

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

127

// Layer normalization

128

_has_layer_norm = lstm_params.use_layer_norm();

129

if(_has_layer_norm)

130

{

131

set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);

132

set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);

133

set_layer_norm_weight(lstm_params.input_layer_norm_weights(), LayerNormGate::Input);

134

set_layer_norm_weight(lstm_params.output_layer_norm_weights(), LayerNormGate::Output);

135

136

set_layer_norm_bias(forget_gate_bias, LayerNormGate::Forget);

137

set_layer_norm_bias(cell_bias, LayerNormGate::Cell);

138

set_layer_norm_bias(lstm_params.input_gate_bias(), LayerNormGate::Input);

139

set_layer_norm_bias(output_gate_bias, LayerNormGate::Output);

140

}

141

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

142

_has_cifg = lstm_params.has_cifg_opt();

143

_has_projection = lstm_params.has_projection();

144

_has_peephole = lstm_params.has_peephole_opt();

145

146

// Calculate and decompose effective scales for optimizing matmul calculation

147

const int32_t cell_shift = log2(qcell_state_in.scale);

148

149

// Calculate quantized parameters for clipping.

150

int16_t quantized_cell_clip = 0;

151

if(lstm_params.cell_clip() > 0.0f)

152

{

153

quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);

154

}

155

_has_cell_clipping = quantized_cell_clip > 0;

156

157

// Precompute effective bias for optimizing the matmul computations.

158

if(!_has_cifg)

159

{

160

_input_to_input_weights = lstm_params.input_to_input_weights();

161

_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();

162

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

163

_input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

164

_recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

165

}

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

166

_input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

167

_recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

168

_input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

169

_recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

170

_input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

171

_recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

172

if(_projection_bias != nullptr)

173

{

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

174

_projection_reduction.configure(compile_context, _projection_weights, &_projection_reduction_res, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(), true));

175

_projection_bias_add.configure(compile_context, ArithmeticOperation::ADD, _projection_bias, &_projection_reduction_res, &_projection_eff_bias, ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

176

}

177

178

// Pre-transpose weights to be used in GEMM.

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

179

_transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);

180

_transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);

181

_transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);

182

_transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);

183

_transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);

184

_transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

185

if(!_has_cifg)

186

{

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

187

_transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);

188

_transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

189

}

190

if(_has_projection)

191

{

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

192

_transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

193

}

194

195

GEMMLowpOutputStageInfo gemmlowp_info;

196

gemmlowp_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;

197

gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();

198

gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();

199

gemmlowp_info.output_data_type = DataType::QSYMM16;

200

201

const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);

202

// Forget gate.

203

const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));

204

const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

205

configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

206

input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,

207

&_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,

208

mm_out_info, forget_gate_outstage_info);

209

210

const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

211

configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

212

output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,

213

&_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,

214

mm_out_info, forget_gate_outstage_info);

215

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

216

_accumulate_input_recurrent_forget.configure(compile_context, ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,

217

ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

218

_input_to_forget_outstage_res.allocator()->allocate();

if(_has_peephole)

{

_memory_group.manage(&_mul_cell_to_forget_res);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

223

_pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

224

_cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));

225

_memory_group.manage(&_cell_to_forget_outstage_res);

226

const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();

227

quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

228

_cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

229

_mul_cell_to_forget_res.allocator()->allocate();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

230

_accumulate_cell_forget.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,

231

ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

232

_cell_to_forget_outstage_res.allocator()->allocate();

233

}

234

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

235

CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res;

if(_has_layer_norm)

{

configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res);

240

_recurrent_to_forget_outstage_res.allocator()->allocate();

241

forget_activation_input = &get_layer_norm_output(LayerNormGate::Forget);

242

}

243

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

244

// Output quantization info of Sigmoid and Tanh activations

245

const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);

246

247

const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

248

_memory_group.manage(&_forget_gate);

249

_forget_gate.allocator()->init(forget_gate_info);

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

250

_forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));

251

forget_activation_input->allocator()->allocate();

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

252

253

// Modulation gate.

254

const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));

255

const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

256

configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

257

input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,

258

&_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,

259

mm_out_info, cell_outstage_info);

260

261

const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

262

configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

263

output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,

264

&_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,

265

mm_out_info, cell_outstage_info);

266

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

267

_accumulate_input_recurrent_modulation.configure(compile_context, ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,

268

ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

269

_input_to_cell_outstage_res.allocator()->allocate();

270

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

271

CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res;

if(_has_layer_norm)

{

configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res);

276

_recurrent_to_cell_outstage_res.allocator()->allocate();

277

cell_activation_input = &get_layer_norm_output(LayerNormGate::Cell);

278

}

279

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

280

const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

281

_memory_group.manage(&_cell_gate);

282

_cell_gate.allocator()->init(cell_gate_info);

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

283

_cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

284

cell_activation_input->allocator()->allocate();

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

285

286

// Input gate.

287

const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

288

_input_gate.allocator()->init(input_gate_info);

289

_memory_group.manage(&_input_gate);

290

if(_has_cifg)

291

{

292

_ones.allocator()->init(*_forget_gate.info());

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

293

_input_gate_sub.configure(compile_context, ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

294

_ones.allocator()->allocate();

}

else

{

const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));

299

const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

300

configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

301

input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,

302

&_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,

303

mm_out_info, input_outstage_info);

304

305

const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

306

configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

307

input, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,

308

&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,

309

mm_out_info, input_outstage_info);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

310

_accumulate_input_recurrent_input.configure(compile_context, ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,

311

ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

312

_input_to_input_outstage_res.allocator()->allocate();

if(_has_peephole)

{

_memory_group.manage(&_mul_cell_to_input_res);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

317

_pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

318

const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();

319

quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

320

_cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));

321

_memory_group.manage(&_cell_to_input_outstage_res);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

322

_cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

323

_mul_cell_to_input_res.allocator()->allocate();

324

_accumulate_cell_input.configure(ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);

325

_cell_to_input_outstage_res.allocator()->allocate();

326

}

327

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

328

CLTensor *input_activation_input = &_recurrent_to_input_outstage_res;

if(_has_layer_norm)

{

configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res);

333

_recurrent_to_input_outstage_res.allocator()->allocate();

334

input_activation_input = &get_layer_norm_output(LayerNormGate::Input);

335

}

336

337

_input_gate_tanh.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

338

input_activation_input->allocator()->allocate();

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

339

}

340

// Cell.

341

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

342

_pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

343

const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;

344

const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);

345

const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));

346

_memory_group.manage(&_mul_input_cell_res);

347

_mul_input_cell_res.allocator()->init(mul_input_cell_info);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

348

_pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

349

_cell_gate.allocator()->allocate();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

350

_add_forget_cell.configure(compile_context, ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

351

_mul_input_cell_res.allocator()->allocate();

352

_forget_gate.allocator()->allocate();

353

if(_has_cell_clipping)

354

{

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

355

_cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

356

}

357

// Output gate.

358

const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));

359

const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

360

configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

361

input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,

362

&_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,

363

mm_out_info, output_outstage_info);

364

365

const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

366

configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

367

output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,

368

&_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,

369

mm_out_info, output_outstage_info);

370

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

371

_accumulate_input_recurrent_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,

372

ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

373

_input_to_output_outstage_res.allocator()->allocate();

if(_has_peephole)

{

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

378

// Here we are not using the output stage because all operations are done in float

379

// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();

380

// quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

381

_memory_group.manage(&_mul_cell_to_output_res);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

382

_pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

383

_accumulate_cell_to_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_mul_cell_to_output_res, &_recurrent_to_output_outstage_res,

384

ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

385

_mul_cell_to_output_res.allocator()->allocate();

386

}

387

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

388

CLTensor *output_activation_input = &_recurrent_to_output_outstage_res;

if(_has_layer_norm)

{

configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res);

393

_recurrent_to_output_outstage_res.allocator()->allocate();

394

output_activation_input = &get_layer_norm_output(LayerNormGate::Output);

395

}

396

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

397

const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

398

_memory_group.manage(&_output_gate);

399

_output_gate.allocator()->init(output_gate_info);

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

400

_output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));

401

output_activation_input->allocator()->allocate();

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

402

403

// Hidden.

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

404

_hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

405

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

406

_memory_group.manage(&_hidden_mul_res);

407

const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);

408

_hidden_mul_res.allocator()->init(hidden_mul_res);

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

409

_pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

410

_output_gate.allocator()->allocate();

411

_input_gate.allocator()->allocate();

412

const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);

413

quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);

414

gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();

415

gemmlowp_info.output_data_type = output_state_in->info()->data_type();

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

416

_hidden_outstage.configure(compile_context, &_hidden_mul_res, nullptr, output_state_out, gemmlowp_info);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

417

_hidden_mul_res.allocator()->allocate();

// Projection.

if(_has_projection)

{

const TensorInfo projection_outstage_info(*output_state_out->info());

423

const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();

424

const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;

425

gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;

426

gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();

427

gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();

428

gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;

429

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

430

configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

431

output_state_out, &_projection_weights_transposed, &_projection_eff_bias,

432

&_mm_projection_res, &_projection_outstage_res, projection_scale,

433

mm_out_info, projection_outstage_info);

434

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

435

_accumulate_projection.configure(compile_context, ArithmeticOperation::ADD, &_projection_outstage_res, output_state_out, output_state_out, ConvertPolicy::SATURATE);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

436

_projection_outstage_res.allocator()->allocate();

437

438

int8_t quantized_projection_clip{ 0 };

439

if(lstm_params.projection_clip() > 0.0f)

440

{

441

quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);

442

}

443

444

if(quantized_projection_clip > 0)

445

{

Manuel Bottini

2020-04-08 10:15:51 +0100

[diff] [blame]

446

_projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,

447

quantized_projection_clip));

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

448

_has_projection_clipping = true;

449

}

450

}

Michele Di Giorgio

2020-05-11 16:17:51 +0100

[diff] [blame^]

451

452

// Copy output_state_out to output

453

_copy_output.configure(compile_context, output_state_out, output);

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

454

}

455

456

/** Check whether the given tensor infos and LSTM parameters describe a configuration accepted by CLQLSTMLayer.
 *
 * The function checks, in order:
 *  - that no mandatory tensor info is null, and the data types and shapes of the input, weights, biases and states;
 *  - the optional peephole weights;
 *  - the effective-bias reductions and the weight transpositions;
 *  - each gate (forget, modulation, input, output), the cell state update, the hidden state and the optional projection,
 *    by calling the validate() method of the kernel or function used for that step.
 *
 * @param[in] input                       Input tensor info. Must be 2D and QASYMM8_SIGNED.
 * @param[in] input_to_forget_weights     Input-to-forget weights info. QSYMM8, same shape as @p input_to_output_weights.
 * @param[in] input_to_cell_weights       Input-to-cell weights info. QSYMM8, same shape as @p input_to_output_weights.
 * @param[in] input_to_output_weights     Input-to-output weights info. 2D; dimension 0 must equal input dimension 0, dimension 1 is taken as num_units.
 * @param[in] recurrent_to_forget_weights Recurrent-to-forget weights info. Same shape as @p recurrent_to_output_weights.
 * @param[in] recurrent_to_cell_weights   Recurrent-to-cell weights info. Same shape as @p recurrent_to_output_weights.
 * @param[in] recurrent_to_output_weights Recurrent-to-output weights info. 2D; dimension 0 is taken as output_size, dimension 1 must equal num_units.
 * @param[in] forget_gate_bias            Forget gate bias info. 1D of size num_units, S32.
 * @param[in] cell_bias                   Cell bias info. Same shape and data type as @p forget_gate_bias.
 * @param[in] output_gate_bias            Output gate bias info. Same shape and data type as @p forget_gate_bias.
 * @param[in] cell_state_in               Input cell state info. 2D [num_units, batch_size], QSYMM16.
 * @param[in] output_state_in             Input output state info. 2D [output_size, batch_size], same data type as @p input.
 * @param[in] cell_state_out              Output cell state info. If initialised it must match @p cell_state_in.
 * @param[in] output_state_out            Output output state info. If initialised it must match @p output_state_in in shape and @p input in data type.
 * @param[in] output                      Output info. Validated as the destination of a copy of @p output_state_out.
 * @param[in] lstm_params                 Optional weights, biases and quantization parameters (CIFG, peephole, projection, layer normalization).
 *
 * @return An empty Status if every check passes, the first failing Status otherwise.
 */
Status CLQLSTMLayer::validate(const ITensorInfo *input,
                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
                              const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
                              const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
                              const LSTMParams<ITensorInfo> &lstm_params)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
                                        cell_state_out, output_state_out, output);

    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");

    const unsigned int input_size  = input->dimension(0);
    const unsigned int batch_size  = input->dimension(1);
    const unsigned int num_units   = input_to_output_weights->dimension(1);
    const unsigned int output_size = recurrent_to_output_weights->dimension(0);

    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);

    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(forget_gate_bias, 1, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, cell_bias, output_gate_bias);

    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() != 2);
    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(0) != num_units);
    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(1) != batch_size);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(cell_state_in, 1, DataType::QSYMM16);

    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() != 2);
    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(0) != output_size);
    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(1) != batch_size);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);

    // Check whether peephole weights are all there or none
    if(lstm_params.has_peephole_opt())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());

        if(!lstm_params.has_cifg_opt())
        {
            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
        }
    }

    const UniformQuantizationInfo qinput           = input->quantization_info().uniform();
    const UniformQuantizationInfo qcell_state_in   = cell_state_in->quantization_info().uniform();
    const UniformQuantizationInfo qoutput_state_in = output_state_in->quantization_info().uniform();

    // Calculate and decompose effective scales for optimizing matmul calculation
    const int32_t cell_shift = log2(qcell_state_in.scale);
    ARM_COMPUTE_RETURN_ERROR_ON(cell_shift > -9);

    // Calculate quantized parameters for clipping.
    int16_t quantized_cell_clip = 0;
    if(lstm_params.cell_clip() > 0.0f)
    {
        quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
    }

    // Precompute effective bias for optimizing the matmul computations.
    const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
    if(!lstm_params.has_cifg_opt())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
                                                                               true)));
    }
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
    if(lstm_params.projection_bias() != nullptr)
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(),
                                                                               true)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, lstm_params.projection_bias(), &eff_bias_info, &eff_bias_info, ConvertPolicy::SATURATE));
    }

    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());

    // Validate weights transpose
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_cell_weights, &input_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_output_weights, &input_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
    if(!lstm_params.has_cifg_opt())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
    }
    if(lstm_params.has_projection())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &recurrent_weights_transposed));
    }

    GEMMLowpOutputStageInfo gemmlowp_info;
    gemmlowp_info.type               = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();
    gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();
    gemmlowp_info.output_data_type   = DataType::QSYMM16;

    const bool has_layer_norm = lstm_params.use_layer_norm();

    // Forget gate.
    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
    const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
    const float      input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
    // The status of every validate_mm() call is propagated so that a failing matmul/output stage is reported to the caller.
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));

    // Recurrent matmuls multiply the previous output state (not the input) by the transposed recurrent weights.
    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));

    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));

    if(lstm_params.has_peephole_opt())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
                                                                              RoundingPolicy::TO_ZERO));
        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
    }

    if(has_layer_norm)
    {
        const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
        const ITensorInfo *b_info = forget_gate_bias;
        ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(forget_outstage_info, *w_info, *b_info));
    }

    // Output quantization info of Sigmoid and Tanh activations
    const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);

    const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));

    // Modulation gate.
    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
    const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));

    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));

    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));

    if(has_layer_norm)
    {
        const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
        const ITensorInfo *b_info = cell_bias;
        ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
    }

    const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));

    // Input gate.
    const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
    if(lstm_params.has_cifg_opt())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());

        // As for the other gates, the matmul is validated against the transposed weights layout.
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(input, &input_weights_transposed, nullptr, &mm_out_info));
        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
        const float      input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));

        const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));

        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));

        if(lstm_params.has_peephole_opt())
        {
            // Same scheme as the forget gate: multiply into an S32 intermediate, then requantize it with the output stage.
            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
                                                                                  RoundingPolicy::TO_ZERO));
            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
            ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
        }

        if(has_layer_norm)
        {
            const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
            const ITensorInfo *b_info = lstm_params.input_gate_bias();
            // Layer normalization is applied to the input gate's own intermediate tensor.
            ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info));
        }

        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
    }

    // Cell.
    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
    if(quantized_cell_clip > 0)
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
                                                                                                             quantized_cell_clip)));
    }

    // Output gate.
    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
    const float      input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));

    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));

    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
    if(lstm_params.has_peephole_opt())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
        // Here we are not using the output stage because all operations are done in float
        // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
        // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
                                                                              RoundingPolicy::TO_ZERO));
        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
    }

    if(has_layer_norm)
    {
        const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
        const ITensorInfo *b_info = output_gate_bias;
        ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(output_outstage_info, *w_info, *b_info));
    }

    const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));

    // Hidden.
    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
    const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
    gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
    // Mirror configure(): the hidden output stage produces the data type of the output state.
    gemmlowp_info.output_data_type = output_state_in->data_type();
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, output_state_out, gemmlowp_info));

    // Projection.
    if(lstm_params.has_projection())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
        // The projection bias is optional (see the effective bias computation above), so only check it when present.
        if(lstm_params.projection_bias() != nullptr)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias());
        }

        const UniformQuantizationInfo qprojection      = lstm_params.projection_weights()->quantization_info().uniform();
        const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
        gemmlowp_info.gemmlowp_offset    = qoutput_state_in.offset;
        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
        gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;

        const TensorInfo projection_outstage_info(*output_state_out);
        // Use the projection scale (as configure() does), not the input-to-output scale.
        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_out, &recurrent_weights_transposed, &eff_bias_info, projection_scale, &mm_out_info, &projection_outstage_info));

        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));

        int8_t quantized_projection_clip{ 0 };
        if(lstm_params.projection_clip() > 0.0f)
        {
            quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
        }

        if(quantized_projection_clip > 0)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
                                                                                                                   quantized_projection_clip)));
        }
    }

    // Output tensors that are already initialised must be consistent with the corresponding input state.
    if(cell_state_out->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
    }

    if(output_state_out->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
    }

    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output));
    return Status{};
}

void CLQLSTMLayer::run()

{

prepare();

// Acquire all the temporaries

786

MemoryGroupResourceScope scope_mg(_memory_group);

787

788

// Forget gate.

789

_mm_input_to_forget.run();

790

_input_to_forget_outstage.run();

791

792

_mm_recurrent_to_forget.run();

793

_recurrent_to_forget_outstage.run();

794

CLScheduler::get().enqueue(_accumulate_input_recurrent_forget);

if(_has_peephole)

{

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_forget);

799

_cell_to_forget_outstage.run();

800

CLScheduler::get().enqueue(_accumulate_cell_forget);

801

}

802

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

803

if(_has_layer_norm)

804

{

805

CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget));

806

}

807

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

808

_forget_gate_sigmoid.run();

809

810

// Modulation gate.

811

_mm_input_to_cell.run();

812

_input_to_cell_outstage.run();

813

814

_mm_recurrent_to_cell.run();

815

_recurrent_to_cell_outstage.run();

816

CLScheduler::get().enqueue(_accumulate_input_recurrent_modulation);

817

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

818

if(_has_layer_norm)

819

{

820

CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell));

821

}

822

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

823

_cell_gate_tanh.run();

// Input gate

if(_has_cifg)

{

CLScheduler::get().enqueue(_input_gate_sub);

}

else

{

_mm_input_to_input.run();

833

_input_to_input_outstage.run();

834

_mm_recurrent_to_input.run();

835

_recurrent_to_input_outstage.run();

836

CLScheduler::get().enqueue(_accumulate_input_recurrent_input);

if(_has_peephole)

{

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_input);

841

_cell_to_input_outstage.run();

842

CLScheduler::get().enqueue(_accumulate_cell_input);

843

}

844

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

845

if(_has_layer_norm)

846

{

847

CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input));

848

}

849

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

850

_input_gate_tanh.run();

}

// Cell.

CLScheduler::get().enqueue(_pixelwise_mul_forget_cell);

855

CLScheduler::get().enqueue(_pixelwise_mul_input_cell);

856

CLScheduler::get().enqueue(_add_forget_cell);

857

if(_has_cell_clipping)

{

_cell_clip.run();

}

// Output gate.

_mm_input_to_output.run();

864

_input_to_output_outstage.run();

865

_mm_recurrent_to_output.run();

866

_recurrent_to_output_outstage.run();

867

CLScheduler::get().enqueue(_accumulate_input_recurrent_output);

868

if(_has_peephole)

869

{

870

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_output);

871

CLScheduler::get().enqueue(_accumulate_cell_to_output);

872

}

873

Sheri Zhang

2020-04-21 13:10:24 +0100

[diff] [blame]

874

if(_has_layer_norm)

875

{

876

CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output));

877

}

878

Michele Di Giorgio

2020-04-02 17:35:42 +0100

[diff] [blame]

879

_output_gate_sigmoid.run();

// Hidden.

_hidden_tanh.run();

CLScheduler::get().enqueue(_pixelwise_mul_hidden);

884

_hidden_outstage.run();

// Projection.

if(_has_projection)

{

_mm_projection.run();

890

_projection_outstage.run();

891

CLScheduler::get().enqueue(_accumulate_projection);

892

if(_has_projection_clipping)

893

{

894

_projection_clip.run();

895

}

896

}

Michele Di Giorgio

2020-05-11 16:17:51 +0100

[diff] [blame^]

897

898

// Copy output_state_out to output

899

CLScheduler::get().enqueue(_copy_output);

Michele Di Giorgio