Blame - src/runtime/CL/functions/CLQLSTMLayer.cpp - ml/ComputeLibrary

ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));

44

ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));

45

return Status{};

}

} // namespace

/** Construct the layer and bind its internal memory group to @p memory_manager.
 *
 * @param[in] memory_manager Memory manager handed to the MemoryGroup that this function
 *                           uses for the intermediate tensors it registers via manage().
 */
CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
{
    // Build the group around the caller's manager first, then move it into the member.
    MemoryGroup group(std::move(memory_manager));
    _memory_group = std::move(group);
}

53

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

54

void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

55

const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,

56

CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,

57

const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)

58

{

59

_memory_group.manage(mm_res);

60

_memory_group.manage(outstage_res);

61

62

mm_res->allocator()->init(mm_res_info);

63

outstage_res->allocator()->init(outstage_tensor_info);

64

65

// Configure matrix-multiplication

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

66

mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

67

68

// Configure output stage

69

quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

70

outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

71

mm_res->allocator()->allocate();

72

}

73

74

void CLQLSTMLayer::configure(const ICLTensor *input,

75

const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,

76

const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,

77

const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,

78

const ICLTensor *cell_state_in, const ICLTensor *output_state_in,

79

ICLTensor *cell_state_out, ICLTensor *output_state_out,

80

const LSTMParams<ICLTensor> &lstm_params)

81

{

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

82

configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,

83

recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,

84

cell_state_in, output_state_in, cell_state_out, output_state_out, lstm_params);

85

}

86

87

void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,

88

const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,

89

const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,

90

const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,

91

const ICLTensor *cell_state_in, const ICLTensor *output_state_in,

92

ICLTensor *cell_state_out, ICLTensor *output_state_out,

93

const LSTMParams<ICLTensor> &lstm_params)

94

{

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

95

ARM_COMPUTE_UNUSED(forget_gate_bias);

96

ARM_COMPUTE_UNUSED(cell_bias);

97

ARM_COMPUTE_UNUSED(output_gate_bias);

98

ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,

99

recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,

100

forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);

101

102

// Set lstm parameters

103

LSTMParams<ITensorInfo> lstm_params_info{};

104

build_lstm_params_tensor_info(lstm_params, &lstm_params_info);

105

106

// Validate

107

ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),

108

recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),

109

forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),

110

cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), lstm_params_info));

111

112

const int batch_size = input->info()->dimension(1);

113

const int num_units = input_to_output_weights->info()->dimension(1);

114

115

const UniformQuantizationInfo qinput = input->info()->quantization_info().uniform();

116

const UniformQuantizationInfo qcell_state_in = cell_state_in->info()->quantization_info().uniform();

117

const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();

118

119

_projection_bias = lstm_params.projection_bias();

120

_input_to_forget_weights = input_to_forget_weights;

121

_input_to_cell_weights = input_to_cell_weights;

122

_input_to_output_weights = input_to_output_weights;

123

_recurrent_to_forget_weights = recurrent_to_forget_weights;

124

_recurrent_to_cell_weights = recurrent_to_cell_weights;

125

_recurrent_to_output_weights = recurrent_to_output_weights;

126

_projection_weights = lstm_params.projection_weights();

127

128

_has_cifg = lstm_params.has_cifg_opt();

129

_has_projection = lstm_params.has_projection();

130

_has_peephole = lstm_params.has_peephole_opt();

131

132

// Calculate and decompose effective scales for optimizing matmul calculation

133

const int32_t cell_shift = log2(qcell_state_in.scale);

134

135

// Calculate quantized parameters for clipping.

136

int16_t quantized_cell_clip = 0;

137

if(lstm_params.cell_clip() > 0.0f)

138

{

139

quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);

140

}

141

_has_cell_clipping = quantized_cell_clip > 0;

142

143

// Precompute effective bias for optimizing the matmul computations.

144

if(!_has_cifg)

145

{

146

_input_to_input_weights = lstm_params.input_to_input_weights();

147

_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();

148

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

149

_input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

150

_recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

151

}

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

152

_input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

153

_recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

154

_input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

155

_recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

156

_input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));

157

_recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

158

if(_projection_bias != nullptr)

159

{

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

160

_projection_reduction.configure(compile_context, _projection_weights, &_projection_reduction_res, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(), true));

161

_projection_bias_add.configure(compile_context, ArithmeticOperation::ADD, _projection_bias, &_projection_reduction_res, &_projection_eff_bias, ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

162

}

163

164

// Pre-transpose weights to be used in GEMM.

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

165

_transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);

166

_transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);

167

_transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);

168

_transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);

169

_transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);

170

_transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

171

if(!_has_cifg)

172

{

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

173

_transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);

174

_transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

175

}

176

if(_has_projection)

177

{

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

178

_transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

179

}

180

181

GEMMLowpOutputStageInfo gemmlowp_info;

182

gemmlowp_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;

183

gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();

184

gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();

185

gemmlowp_info.output_data_type = DataType::QSYMM16;

186

187

const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);

188

// Forget gate.

189

const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));

190

const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

191

configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

192

input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,

193

&_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,

194

mm_out_info, forget_gate_outstage_info);

195

196

const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

197

configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

198

output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,

199

&_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,

200

mm_out_info, forget_gate_outstage_info);

201

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

202

_accumulate_input_recurrent_forget.configure(compile_context, ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,

203

ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

204

_input_to_forget_outstage_res.allocator()->allocate();

if(_has_peephole)

{

_memory_group.manage(&_mul_cell_to_forget_res);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

209

_pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

210

_cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));

211

_memory_group.manage(&_cell_to_forget_outstage_res);

212

const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();

213

quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

214

_cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

215

_mul_cell_to_forget_res.allocator()->allocate();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

216

_accumulate_cell_forget.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,

217

ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

218

_cell_to_forget_outstage_res.allocator()->allocate();

219

}

220

221

// Output quantization info of Sigmoid and Tanh activations

222

const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);

223

224

const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

225

_memory_group.manage(&_forget_gate);

226

_forget_gate.allocator()->init(forget_gate_info);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

227

_forget_gate_sigmoid.configure(compile_context, &_recurrent_to_forget_outstage_res, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

228

_recurrent_to_forget_outstage_res.allocator()->allocate();

229

230

// Modulation gate.

231

const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));

232

const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

233

configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

234

input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,

235

&_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,

236

mm_out_info, cell_outstage_info);

237

238

const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

239

configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

240

output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,

241

&_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,

242

mm_out_info, cell_outstage_info);

243

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

244

_accumulate_input_recurrent_modulation.configure(compile_context, ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,

245

ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

246

_input_to_cell_outstage_res.allocator()->allocate();

247

248

const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

249

_memory_group.manage(&_cell_gate);

250

_cell_gate.allocator()->init(cell_gate_info);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

251

_cell_gate_tanh.configure(compile_context, &_recurrent_to_cell_outstage_res, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

252

_recurrent_to_cell_outstage_res.allocator()->allocate();

253

254

// Input gate.

255

const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

256

_input_gate.allocator()->init(input_gate_info);

257

_memory_group.manage(&_input_gate);

258

if(_has_cifg)

259

{

260

_ones.allocator()->init(*_forget_gate.info());

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

261

_input_gate_sub.configure(compile_context, ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

262

_ones.allocator()->allocate();

}

else

{

const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));

267

const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

268

configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

269

input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,

270

&_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,

271

mm_out_info, input_outstage_info);

272

273

const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

274

configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

275

input, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,

276

&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,

277

mm_out_info, input_outstage_info);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

278

_accumulate_input_recurrent_input.configure(compile_context, ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,

279

ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

280

_input_to_input_outstage_res.allocator()->allocate();

if(_has_peephole)

{

_memory_group.manage(&_mul_cell_to_input_res);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

285

_pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

286

const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();

287

quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

288

_cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));

289

_memory_group.manage(&_cell_to_input_outstage_res);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

290

_cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

291

_mul_cell_to_input_res.allocator()->allocate();

292

_accumulate_cell_input.configure(ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);

293

_cell_to_input_outstage_res.allocator()->allocate();

294

}

295

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

296

_input_gate_tanh.configure(compile_context, &_recurrent_to_input_outstage_res, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

297

_recurrent_to_input_outstage_res.allocator()->allocate();

298

}

299

// Cell.

300

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

301

_pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

302

const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;

303

const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);

304

const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));

305

_memory_group.manage(&_mul_input_cell_res);

306

_mul_input_cell_res.allocator()->init(mul_input_cell_info);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

307

_pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

308

_cell_gate.allocator()->allocate();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

309

_add_forget_cell.configure(compile_context, ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

310

_mul_input_cell_res.allocator()->allocate();

311

_forget_gate.allocator()->allocate();

312

if(_has_cell_clipping)

313

{

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

314

_cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

315

}

316

// Output gate.

317

const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));

318

const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

319

configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

320

input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,

321

&_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,

322

mm_out_info, output_outstage_info);

323

324

const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

325

configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

326

output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,

327

&_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,

328

mm_out_info, output_outstage_info);

329

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

330

_accumulate_input_recurrent_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,

331

ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

332

_input_to_output_outstage_res.allocator()->allocate();

if(_has_peephole)

{

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

337

// Here we are not using the output stage because all operations are done in float

338

// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();

339

// quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);

340

_memory_group.manage(&_mul_cell_to_output_res);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

341

_pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

342

_accumulate_cell_to_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_mul_cell_to_output_res, &_recurrent_to_output_outstage_res,

343

ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

344

_mul_cell_to_output_res.allocator()->allocate();

345

}

346

347

const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);

348

_memory_group.manage(&_output_gate);

349

_output_gate.allocator()->init(output_gate_info);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

350

_output_gate_sigmoid.configure(compile_context, &_recurrent_to_output_outstage_res, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

351

_recurrent_to_output_outstage_res.allocator()->allocate();

352

353

// Hidden.

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

354

_hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

355

// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel

356

_memory_group.manage(&_hidden_mul_res);

357

const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);

358

_hidden_mul_res.allocator()->init(hidden_mul_res);

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

359

_pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

360

_output_gate.allocator()->allocate();

361

_input_gate.allocator()->allocate();

362

const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);

363

quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);

364

gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();

365

gemmlowp_info.output_data_type = output_state_in->info()->data_type();

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

366

_hidden_outstage.configure(compile_context, &_hidden_mul_res, nullptr, output_state_out, gemmlowp_info);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

367

_hidden_mul_res.allocator()->allocate();

// Projection.

if(_has_projection)

{

const TensorInfo projection_outstage_info(*output_state_out->info());

373

const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();

374

const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;

375

gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;

376

gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();

377

gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();

378

gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;

379

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

380

configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

381

output_state_out, &_projection_weights_transposed, &_projection_eff_bias,

382

&_mm_projection_res, &_projection_outstage_res, projection_scale,

383

mm_out_info, projection_outstage_info);

384

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

385

_accumulate_projection.configure(compile_context, ArithmeticOperation::ADD, &_projection_outstage_res, output_state_out, output_state_out, ConvertPolicy::SATURATE);

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

386

_projection_outstage_res.allocator()->allocate();

387

388

int8_t quantized_projection_clip{ 0 };

389

if(lstm_params.projection_clip() > 0.0f)

390

{

391

quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);

392

}

393

394

if(quantized_projection_clip > 0)

395

{

Manuel Bottini

2b84be5

2020-04-08 10:15:51 +0100

[diff] [blame^]

396

_projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,

397

quantized_projection_clip));

Michele Di Giorgio

1c1b3aa

2020-04-02 17:35:42 +0100

[diff] [blame]

398

_has_projection_clipping = true;

}

}

}

/** Static check of whether a quantized LSTM layer can be configured with the given tensor infos.
 *
 * The function runs the validate() entry point of each function/kernel the layer is built from and
 * returns the Status of the first check that fails.
 *
 * @param[in] input                       Input info. Checked to be 2D (input_size, batch_size) of type QASYMM8_SIGNED.
 * @param[in] input_to_forget_weights     Input-to-forget weights info. Checked to be QSYMM8 with the shape of @p input_to_output_weights.
 * @param[in] input_to_cell_weights       Input-to-cell weights info. Same shape and data type as @p input_to_forget_weights.
 * @param[in] input_to_output_weights     Input-to-output weights info. Checked to be 2D with dimension(0) == input_size; dimension(1) defines num_units.
 * @param[in] recurrent_to_forget_weights Recurrent-to-forget weights info. Same shape as @p recurrent_to_output_weights.
 * @param[in] recurrent_to_cell_weights   Recurrent-to-cell weights info. Same shape as @p recurrent_to_output_weights.
 * @param[in] recurrent_to_output_weights Recurrent-to-output weights info. Checked to be 2D with dimension(1) == num_units; dimension(0) defines output_size.
 * @param[in] forget_gate_bias            Forget gate bias info. Checked to be 1D with num_units elements of type S32.
 * @param[in] cell_bias                   Cell bias info. Same shape and data type as @p forget_gate_bias.
 * @param[in] output_gate_bias            Output gate bias info. Same shape and data type as @p forget_gate_bias.
 * @param[in] cell_state_in               Input cell state info. Checked to be 2D (num_units, batch_size) of type QSYMM16.
 * @param[in] output_state_in             Input output state info. Checked to be 2D (output_size, batch_size) with the data type of @p input.
 * @param[in] cell_state_out              Output cell state info. Only compared against @p cell_state_in when its total size is non-zero.
 * @param[in] output_state_out            Output state info. Only compared against @p input / @p output_state_in when its total size is non-zero.
 * @param[in] lstm_params                 Optional parameters (CIFG, peephole, projection, clipping thresholds, intermediate and hidden-state scales).
 *
 * @return An empty Status when every check passes, otherwise the Status describing the first failure.
 */
Status CLQLSTMLayer::validate(const ITensorInfo *input,
                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
                              const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
                              const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out,
                              const LSTMParams<ITensorInfo> &lstm_params)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);

    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");

    const unsigned int input_size  = input->dimension(0);
    const unsigned int batch_size  = input->dimension(1);
    const unsigned int num_units   = input_to_output_weights->dimension(1);
    const unsigned int output_size = recurrent_to_output_weights->dimension(0);

    // Mandatory weights
    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);

    // Mandatory biases
    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(forget_gate_bias, 1, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, cell_bias, output_gate_bias);

    // Input states
    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() != 2);
    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(0) != num_units);
    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(1) != batch_size);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(cell_state_in, 1, DataType::QSYMM16);

    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() != 2);
    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(0) != output_size);
    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(1) != batch_size);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);

    // Check whether peephole weights are all there or none
    if(lstm_params.has_peephole_opt())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());

        if(!lstm_params.has_cifg_opt())
        {
            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
        }
    }

    const UniformQuantizationInfo qinput           = input->quantization_info().uniform();
    const UniformQuantizationInfo qcell_state_in   = cell_state_in->quantization_info().uniform();
    const UniformQuantizationInfo qoutput_state_in = output_state_in->quantization_info().uniform();

    // Calculate and decompose effective scales for optimizing matmul calculation
    const int32_t cell_shift = log2(qcell_state_in.scale);
    ARM_COMPUTE_RETURN_ERROR_ON(cell_shift > -9);

    // Calculate quantized parameters for clipping.
    int16_t quantized_cell_clip = 0;
    if(lstm_params.cell_clip() > 0.0f)
    {
        quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
    }

    // Precompute effective bias for optimizing the matmul computations.
    const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
    if(!lstm_params.has_cifg_opt())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
                                                                               true)));
    }
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
    if(lstm_params.projection_bias() != nullptr)
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(),
                                                                               true)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, lstm_params.projection_bias(), &eff_bias_info, &eff_bias_info, ConvertPolicy::SATURATE));
    }

    // Infos of the transposed weights that are fed to the matrix multiplications
    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());

    // Validate weights transpose
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_cell_weights, &input_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_output_weights, &input_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
    if(!lstm_params.has_cifg_opt())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
    }
    if(lstm_params.has_projection())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &recurrent_weights_transposed));
    }

    // Output stage settings shared by all the gates (16-bit symmetric intermediates)
    GEMMLowpOutputStageInfo gemmlowp_info;
    gemmlowp_info.type               = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();
    gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();
    gemmlowp_info.output_data_type   = DataType::QSYMM16;

    // Forget gate.
    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
    const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
    const float      input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
    // The Status returned by validate_mm() must be propagated, otherwise matmul/output stage failures are lost
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));

    // Recurrent matmuls consume the previous output state, not the layer input
    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));

    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));

    if(lstm_params.has_peephole_opt())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
                                                                              RoundingPolicy::TO_ZERO));
        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
    }

    // Output quantization info of Sigmoid and Tanh activations
    const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);

    const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));

    // Modulation gate.
    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
    const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));

    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));

    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));

    const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));

    // Input gate.
    const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
    if(lstm_params.has_cifg_opt())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());

        // As for the other gates, the matrix multiplications are validated against the transposed weights
        // (the input gate weights were checked above to share the shape of the forget gate ones).
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(input, &input_weights_transposed, nullptr, &mm_out_info));
        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
        const float      input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));

        const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));

        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));

        if(lstm_params.has_peephole_opt())
        {
            // Same pattern as the forget gate peephole: the product is held in the S32 intermediate
            // which is then requantized by the output stage.
            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
                                                                                  RoundingPolicy::TO_ZERO));
            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
            ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
        }

        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
    }

    // Cell.
    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
    if(quantized_cell_clip > 0)
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
                                                                                                             quantized_cell_clip)));
    }

    // Output gate.
    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
    const float      input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));

    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));

    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
    if(lstm_params.has_peephole_opt())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
        // Here we are not using the output stage because all operations are done in float
        // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
        // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
                                                                              RoundingPolicy::TO_ZERO));
        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
    }

    const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));

    // Hidden.
    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
    const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
    gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, output_state_out, gemmlowp_info));

    // Projection.
    if(lstm_params.has_projection())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias());

        const UniformQuantizationInfo qprojection      = lstm_params.projection_weights()->quantization_info().uniform();
        const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
        gemmlowp_info.gemmlowp_offset    = qoutput_state_in.offset;
        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
        gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;

        const TensorInfo projection_outstage_info(*output_state_out);
        // The projection output stage is requantized with the projection scale (as done by configure()),
        // not with the output gate scale.
        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_out, &recurrent_weights_transposed, &eff_bias_info, projection_scale, &mm_out_info, &projection_outstage_info));

        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));

        int8_t quantized_projection_clip{ 0 };
        if(lstm_params.projection_clip() > 0.0f)
        {
            quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
        }

        if(quantized_projection_clip > 0)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
                                                                                                                   quantized_projection_clip)));
        }
    }

    // Output states are only cross-checked when their infos have already been initialised
    if(cell_state_out->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
    }

    if(output_state_out->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
    }

    return Status{};
}

void CLQLSTMLayer::run()

{

prepare();

// Acquire all the temporaries

701

MemoryGroupResourceScope scope_mg(_memory_group);

702

703

// Forget gate.

704

_mm_input_to_forget.run();

705

_input_to_forget_outstage.run();

706

707

_mm_recurrent_to_forget.run();

708

_recurrent_to_forget_outstage.run();

709

CLScheduler::get().enqueue(_accumulate_input_recurrent_forget);

if(_has_peephole)

{

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_forget);

714

_cell_to_forget_outstage.run();

715

CLScheduler::get().enqueue(_accumulate_cell_forget);

716

}

717

718

_forget_gate_sigmoid.run();

719

720

// Modulation gate.

721

_mm_input_to_cell.run();

722

_input_to_cell_outstage.run();

723

724

_mm_recurrent_to_cell.run();

725

_recurrent_to_cell_outstage.run();

726

CLScheduler::get().enqueue(_accumulate_input_recurrent_modulation);

727

728

_cell_gate_tanh.run();

// Input gate

if(_has_cifg)

{

CLScheduler::get().enqueue(_input_gate_sub);

}

else

{

_mm_input_to_input.run();

738

_input_to_input_outstage.run();

739

_mm_recurrent_to_input.run();

740

_recurrent_to_input_outstage.run();

741

CLScheduler::get().enqueue(_accumulate_input_recurrent_input);

if(_has_peephole)

{

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_input);

746

_cell_to_input_outstage.run();

747

CLScheduler::get().enqueue(_accumulate_cell_input);

748

}

749

750

_input_gate_tanh.run();

}

// Cell.

CLScheduler::get().enqueue(_pixelwise_mul_forget_cell);

755

CLScheduler::get().enqueue(_pixelwise_mul_input_cell);

756

CLScheduler::get().enqueue(_add_forget_cell);

757

if(_has_cell_clipping)

{

_cell_clip.run();

}

// Output gate.

_mm_input_to_output.run();

764

_input_to_output_outstage.run();

765

_mm_recurrent_to_output.run();

766

_recurrent_to_output_outstage.run();

767

CLScheduler::get().enqueue(_accumulate_input_recurrent_output);

768

if(_has_peephole)

769

{

770

CLScheduler::get().enqueue(_pixelwise_mul_cell_to_output);

771

CLScheduler::get().enqueue(_accumulate_cell_to_output);

772

}

773

774

_output_gate_sigmoid.run();

// Hidden.

_hidden_tanh.run();

CLScheduler::get().enqueue(_pixelwise_mul_hidden);

779

_hidden_outstage.run();

// Projection.

if(_has_projection)

{

_mm_projection.run();

785

_projection_outstage.run();

786

CLScheduler::get().enqueue(_accumulate_projection);

787

if(_has_projection_clipping)

788

{

789

_projection_clip.run();

}

}

}

void CLQLSTMLayer::prepare()

{

if(!_is_prepared)

{

// Pre-transpose weights to be used in GEMM.

799

_input_to_forget_weights_transposed.allocator()->allocate();

800

_input_to_cell_weights_transposed.allocator()->allocate();

801

_input_to_output_weights_transposed.allocator()->allocate();

802

_recurrent_to_forget_weights_transposed.allocator()->allocate();

803

_recurrent_to_cell_weights_transposed.allocator()->allocate();

804

_recurrent_to_output_weights_transposed.allocator()->allocate();

805

_transpose_input_to_forget_weights.run();

806

_transpose_input_to_cell_weights.run();

807

_transpose_input_to_output_weights.run();

808

_transpose_recurrent_to_forget_weights.run();

809

_transpose_recurrent_to_cell_weights.run();

810

_transpose_recurrent_to_output_weights.run();

811

812

// Precompute effective biases

if(_has_cifg)

{

_ones.map(true);

std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);

_ones.unmap();

}

else

{

_input_to_input_eff_bias.allocator()->allocate();

822

_recurrent_to_input_eff_bias.allocator()->allocate();

823

CLScheduler::get().enqueue(_input_to_input_reduction);

824

CLScheduler::get().enqueue(_recurrent_to_input_reduction);

825

826

_input_to_input_weights_transposed.allocator()->allocate();

827

_recurrent_to_input_weights_transposed.allocator()->allocate();

828

_transpose_input_to_input_weights.run();

829

_transpose_recurrent_to_input_weights.run();

830

_input_to_input_weights->mark_as_unused();

831

_recurrent_to_input_weights->mark_as_unused();

832

}

833

_input_to_forget_eff_bias.allocator()->allocate();

834

_recurrent_to_forget_eff_bias.allocator()->allocate();

835

_input_to_cell_eff_bias.allocator()->allocate();

836

_recurrent_to_cell_eff_bias.allocator()->allocate();

837

_input_to_output_eff_bias.allocator()->allocate();

838

_recurrent_to_output_eff_bias.allocator()->allocate();

839

CLScheduler::get().enqueue(_input_to_forget_reduction);

840

CLScheduler::get().enqueue(_recurrent_to_forget_reduction);

841

CLScheduler::get().enqueue(_input_to_cell_reduction);

842

CLScheduler::get().enqueue(_recurrent_to_cell_reduction);

843

CLScheduler::get().enqueue(_input_to_output_reduction);

844

CLScheduler::get().enqueue(_recurrent_to_output_reduction);

if(_has_projection)

{

if(_projection_bias != nullptr)

849

{

850

_projection_eff_bias.allocator()->allocate();

851

CLScheduler::get().enqueue(_projection_reduction);

852

_projection_bias->mark_as_unused();

853

}

854

855

_projection_weights_transposed.allocator()->allocate();

856

_transpose_projection_weights.run();

857

_projection_weights->mark_as_unused();

858

}

859

860

// Mark weights as unused

861

_input_to_forget_weights->mark_as_unused();

862

_input_to_cell_weights->mark_as_unused();

863

_input_to_output_weights->mark_as_unused();

864

_recurrent_to_forget_weights->mark_as_unused();

865

_recurrent_to_cell_weights->mark_as_unused();

866

_recurrent_to_output_weights->mark_as_unused();

867

868

CLScheduler::get().queue().finish();

_is_prepared = true;

}

}

} // namespace arm_compute