//
// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "NeonUnidirectionalSequenceLstmFloatWorkload.hpp"
#include "NeonWorkloadUtils.hpp"

#include <aclCommon/ArmComputeUtils.hpp>
#include <aclCommon/ArmComputeTensorUtils.hpp>

#include <armnn/utility/NumericCast.hpp>
#include <armnnUtils/Permute.hpp>
#include <neon/test/NeonWorkloadFactoryHelper.hpp>
#include <backendsCommon/WorkloadUtils.hpp>

#include "neon/NeonTensorHandle.hpp"

namespace
{
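// ArmNN and the Compute Library index tensor dimensions in opposite orders, so an
// ArmNN axis has to be mirrored before it is handed to ACL. For example, axis 0 of
// a 3D ArmNN tensor maps to ACL axis (3 - 0) - 1 = 2.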
unsigned int CalcAclAxis(unsigned int numDimensions, unsigned int axis)
{
    return (numDimensions - axis) - 1;
}
} //namespace

namespace armnn
{
using namespace armcomputetensorutils;

NeonUnidirectionalSequenceLstmFloatWorkload::NeonUnidirectionalSequenceLstmFloatWorkload
    (const UnidirectionalSequenceLstmQueueDescriptor& descriptor, const WorkloadInfo& info)
    : FloatWorkload<UnidirectionalSequenceLstmQueueDescriptor>(descriptor, info)
{
    // Report Profiling Details
    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonUnidirectionalSequenceLstmFloatWorkload_Construct",
                                         descriptor.m_Parameters,
                                         info,
                                         GetGuid());

    const arm_compute::ITensor& input = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
    arm_compute::ITensor& output = static_cast<IAclTensorHandle*>(m_Data.m_Outputs[2])->GetTensor();

    TensorInfo inputInfo = info.m_InputTensorInfos[0];
    TensorInfo outputInfo = info.m_OutputTensorInfos[0];

    arm_compute::DataType armComputeDataType = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetDataType();
    armnn::DataType armnnDataType = GetArmNNDataType(armComputeDataType);

    TensorShape inputLayerShape = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetShape();
    TensorShape cellStateLayerShape = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[2])->GetShape();
    TensorShape outputLayerShape = static_cast<IAclTensorHandle*>(m_Data.m_Outputs[2])->GetShape();

    unsigned int maxTime = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1];
    unsigned int batchSize = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0];
    unsigned int inputSize = inputLayerShape[2];
    unsigned int outputSize = outputLayerShape[2];
    unsigned int numUnits = cellStateLayerShape[1];

    const TensorShape timeMajorShapeInput({maxTime, batchSize, inputSize});
    const TensorShape timeMajorShapeOutput({maxTime, batchSize, outputSize});
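
    // The workload operates internally on time major data [maxTime, batchSize, inputSize];
    // a batch major input [batchSize, maxTime, inputSize] is permuted to time major first,
    // e.g. a batch major input of [2, 5, 10] is processed as [5, 2, 10].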

    //
    // Permute: performed if Unidirectional Sequence Layer inputs/outputs are in batch major format.
    //
    if (!m_Data.m_Parameters.m_TimeMajor)
    {
        std::unique_ptr<arm_compute::NEPermute> layer(new arm_compute::NEPermute());

        TensorInfo permuteOutInfo = inputInfo;
        permuteOutInfo.SetShape(timeMajorShapeInput);
        BuildArmComputeTensor(m_PermuteFirstOut, permuteOutInfo);
        armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_PermuteFirstOut);

        // Permute to time major format.
        layer->configure(&input, &m_PermuteFirstOut, arm_compute::PermutationVector(0U, 2U, 1U));
        m_Permute1.reset(layer.release());
    }

    //
    // Split and Concat Tensors
    //
    for (unsigned int i = 0; i < maxTime; ++i)
    {
        arm_compute::Tensor splitter_out;
        arm_compute::Tensor concat_in;

        auto splitterTensorInfo = inputInfo;
        auto concatTensorInfo = outputInfo;
        splitterTensorInfo.SetShape({batchSize, inputSize});
        concatTensorInfo.SetShape({batchSize, outputSize});
        BuildArmComputeTensor(splitter_out, splitterTensorInfo);
        BuildArmComputeTensor(concat_in, concatTensorInfo);

        armcomputetensorutils::InitialiseArmComputeTensorEmpty(splitter_out);
        armcomputetensorutils::InitialiseArmComputeTensorEmpty(concat_in);

        // append to std::vector<arm_compute::Tensor>
        m_SplitterOutputsTensors.push_back(std::move(splitter_out));
        m_ConcatInputsTensors.push_back(std::move(concat_in));
    }

    for (unsigned int i = 0; i < maxTime; ++i)
    {
        // append to std::vector<arm_compute::ITensor*>
        m_SplitterOutputs.push_back(&m_SplitterOutputsTensors[i]);
        m_ConcatInputs.push_back(&m_ConcatInputsTensors[i]);
    }

    //
    // Split
    //
    unsigned int numberDimensions = 3;
    unsigned int dimension = 0; // splitting on 0-dimension (i.e. maxTime dimension)
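
    // The splitter slices the time major input [maxTime, batchSize, inputSize] into
    // maxTime views of shape [1, batchSize, inputSize], one per timestep; with
    // maxTime == 3, for example, the view origins along dimension 0 are 0, 1 and 2.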
    if (maxTime != 1) // ACL split does not work with only one element to split.
    {
        ViewsDescriptor splitterDesc(maxTime, numberDimensions);
        unsigned int splitterDimSizes[3] = {1, batchSize, inputSize};
        for (unsigned int outputIdx = 0u; outputIdx < maxTime; ++outputIdx)
        {
            splitterDesc.SetViewOriginCoord(outputIdx, dimension, splitterDimSizes[dimension] * outputIdx);
            for (unsigned int dimIdx = 0u; dimIdx < numberDimensions; ++dimIdx)
            {
                splitterDesc.SetViewSize(outputIdx, dimIdx, splitterDimSizes[dimIdx]);
            }
        }

        std::set<unsigned int> splitAxis = ComputeSplitAxis(splitterDesc, timeMajorShapeInput);

        std::unique_ptr<arm_compute::NESplit> split_layer(new arm_compute::NESplit());
        unsigned int aclAxisSplit = CalcAclAxis(splitterDesc.GetNumDimensions(),
                                                *splitAxis.begin());
        if (!m_Data.m_Parameters.m_TimeMajor)
        {
            split_layer->configure(&m_PermuteFirstOut, m_SplitterOutputs, aclAxisSplit);
        }
        else
        {
            split_layer->configure(&input, m_SplitterOutputs, aclAxisSplit);
        }

        split_layer->prepare();
        m_Splitter.reset(split_layer.release());
    }

    //
    // Lstm
    //
    arm_compute::LSTMParams<arm_compute::ITensor> lstm_param;

    m_InputToForgetWeightsTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights->GetTensorInfo());

    m_InputToCellWeightsTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights->GetTensorInfo());

    m_InputToOutputWeightsTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights->GetTensorInfo());

    m_RecurrentToForgetWeightsTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights->GetTensorInfo());

    m_RecurrentToCellWeightsTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights->GetTensorInfo());

    m_RecurrentToOutputWeightsTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights->GetTensorInfo());

    m_ForgetGateBiasTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias->GetTensorInfo());

    m_CellBiasTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_CellBiasTensor, m_Data.m_CellBias->GetTensorInfo());

    m_OutputGateBiasTensor = std::make_unique<arm_compute::Tensor>();
    BuildArmComputeTensor(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias->GetTensorInfo());

    // For future reference: check the AndroidNN API for the logic here.
    if (!m_Data.m_Parameters.m_CifgEnabled)
    {
        m_InputToInputWeightsTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights->GetTensorInfo());

        m_RecurrentToInputWeightsTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights->GetTensorInfo());

        m_CellToInputWeightsTensor = std::make_unique<arm_compute::Tensor>();
        if (m_Data.m_CellToInputWeights != nullptr)
        {
            BuildArmComputeTensor(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights->GetTensorInfo());
        }

        m_InputGateBiasTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_InputGateBiasTensor, m_Data.m_InputGateBias->GetTensorInfo());

        lstm_param.set_cifg_params(m_InputToInputWeightsTensor.get(),
                                   m_RecurrentToInputWeightsTensor.get(),
                                   m_Data.m_CellToInputWeights ? m_CellToInputWeightsTensor.get() : nullptr,
                                   m_InputGateBiasTensor.get());
    }

    if (m_Data.m_Parameters.m_ProjectionEnabled)
    {
        m_ProjectionWeightsTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights->GetTensorInfo());

        m_ProjectionBiasTensor = std::make_unique<arm_compute::Tensor>();
        if (m_Data.m_ProjectionBias != nullptr)
        {
            BuildArmComputeTensor(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias->GetTensorInfo());
        }

        lstm_param.set_projection_params(m_ProjectionWeightsTensor.get(),
                                         m_Data.m_ProjectionBias ? m_ProjectionBiasTensor.get() : nullptr);
    }

    if (m_Data.m_Parameters.m_PeepholeEnabled)
    {
        m_CellToForgetWeightsTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights->GetTensorInfo());

        m_CellToOutputWeightsTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights->GetTensorInfo());

        lstm_param.set_peephole_params(m_CellToForgetWeightsTensor.get(), m_CellToOutputWeightsTensor.get());
    }

    if (m_Data.m_Parameters.m_LayerNormEnabled)
    {
        m_InputLayerNormWeightsTensor = std::make_unique<arm_compute::Tensor>();
        if (!m_Data.m_Parameters.m_CifgEnabled)
        {
            BuildArmComputeTensor(*m_InputLayerNormWeightsTensor, m_Data.m_InputLayerNormWeights->GetTensorInfo());
        }

        m_ForgetLayerNormWeightsTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_ForgetLayerNormWeightsTensor, m_Data.m_ForgetLayerNormWeights->GetTensorInfo());

        m_CellLayerNormWeightsTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_CellLayerNormWeightsTensor, m_Data.m_CellLayerNormWeights->GetTensorInfo());

        m_OutputLayerNormWeightsTensor = std::make_unique<arm_compute::Tensor>();
        BuildArmComputeTensor(*m_OutputLayerNormWeightsTensor, m_Data.m_OutputLayerNormWeights->GetTensorInfo());

        auto inputNormWeightTensor = m_Data.m_Parameters.m_CifgEnabled ? nullptr : m_InputLayerNormWeightsTensor.get();
        lstm_param.set_layer_normalization_params(inputNormWeightTensor,
                                                  m_ForgetLayerNormWeightsTensor.get(),
                                                  m_CellLayerNormWeightsTensor.get(),
                                                  m_OutputLayerNormWeightsTensor.get());
    }

    arm_compute::ITensor& output_state_in = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
    arm_compute::ITensor& cell_state_in = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[2])->GetTensor();

    arm_compute::ITensor& output_state_out = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
    arm_compute::ITensor& cell_state_out = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[2])->GetTensor();
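
    // Note: output_state_out and cell_state_out alias the state input tensors, so each
    // timestep's LSTM layer writes its updated state in place and the next timestep
    // reads the refreshed values through output_state_in and cell_state_in.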

    m_ScratchBuffer = std::make_unique<arm_compute::Tensor>();
    if (m_Data.m_Parameters.m_CifgEnabled)
    {
        // scratch_buffer [num_units * 3, batch_size] with CIFG
        BuildArmComputeTensor(*m_ScratchBuffer, TensorInfo({batchSize, numUnits * 3}, armnnDataType));
    }
    else
    {
        // scratch_buffer [num_units * 4, batch_size] without CIFG
        BuildArmComputeTensor(*m_ScratchBuffer, TensorInfo({batchSize, numUnits * 4}, armnnDataType));
    }
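
    // The scratch buffer built above holds the per-batch intermediate gate calculations:
    // four gates (input, forget, cell, output) of numUnits values each, or three when
    // CIFG couples the input gate to the forget gate.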

    // Need to be set at negative threshold to be compatible with ACL.
    float cell_threshold = m_Data.m_Parameters.m_ClippingThresCell;
    float projection_threshold = m_Data.m_Parameters.m_ClippingThresProj;

    // Prepare the ActivationLayerInfo object; five activation function options need to be considered.
    arm_compute::ActivationLayerInfo activationLayerInfo =
        ConvertLstmActivationFuncToAclLayerInfo(m_Data.m_Parameters.m_ActivationFunc);

    for (unsigned int i = 0; i != maxTime; ++i)
    {
        // Set LSTM input and output ITensors depending on:
        // input format (timeMajor) & number of LSTM batches (maxTime).
        arm_compute::ITensor* outputLSTM;
        arm_compute::ITensor* inputLSTM;

        // If there is only one LSTM time major batch, we will not concat OR permute.
        // Set input of LSTM to be first input ITensor.
        // Set output of LSTM to be final output ITensor.
        // LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo.
        if (maxTime == 1 && m_Data.m_Parameters.m_TimeMajor)
        {
            TensorShape inputShape = GetTensorShape(input.info()->tensor_shape(), 1U);
            TensorShape outputShape = GetTensorShape(output.info()->tensor_shape(), 1U);

            TensorShape inputShapeShrink({inputShape[1], inputShape[2]});
            TensorShape outputShapeShrink({outputShape[1], outputShape[2]});

            auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);
            auto acl_output_shape_shrink = BuildArmComputeTensorShape(outputShapeShrink);

            input.info()->set_tensor_shape(acl_input_shape_shrink);
            inputLSTM = const_cast<arm_compute::ITensor*>(&input);

            output.info()->set_tensor_shape(acl_output_shape_shrink);
            outputLSTM = &output;
        }
        // If there is only one LSTM batch major batch, we will not concat, only permute.
        // Set input of LSTM to be output of initial permute.
        // Set output of LSTM to be first element of m_ConcatInputs & use that value later in permute.
        // LSTM output cannot be > 2 dimensions so need to resize its TensorInfo.
        else if (maxTime == 1 && !m_Data.m_Parameters.m_TimeMajor)
        {
            TensorShape inputShape = GetTensorShape(m_PermuteFirstOut.info()->tensor_shape(), 1U);
            TensorShape inputShapeShrink({inputShape[1], inputShape[2]});
            auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);
            m_PermuteFirstOut.info()->set_tensor_shape(acl_input_shape_shrink);
            inputLSTM = &m_PermuteFirstOut;

            outputLSTM = const_cast<arm_compute::ITensor*>(m_ConcatInputs[i]);
        }
        // Batch major AND/OR 2+ LSTM batches so will use concat AND/OR permute later on.
        else
        {
            inputLSTM = m_SplitterOutputs[i];
            outputLSTM = const_cast<arm_compute::ITensor*>(m_ConcatInputs[i]);
        }
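
        // One NELSTMLayer is configured per timestep; all instances share the same
        // weight, bias and state tensors, only the input and output slices differ.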
        std::unique_ptr<arm_compute::NELSTMLayer> lstm_layer(new arm_compute::NELSTMLayer());
        lstm_layer->configure(inputLSTM,
                              m_InputToForgetWeightsTensor.get(),
                              m_InputToCellWeightsTensor.get(),
                              m_InputToOutputWeightsTensor.get(),
                              m_RecurrentToForgetWeightsTensor.get(),
                              m_RecurrentToCellWeightsTensor.get(),
                              m_RecurrentToOutputWeightsTensor.get(),
                              m_ForgetGateBiasTensor.get(),
                              m_CellBiasTensor.get(),
                              m_OutputGateBiasTensor.get(),
                              &output_state_in,
                              &cell_state_in,
                              m_ScratchBuffer.get(),
                              &output_state_out,
                              &cell_state_out,
                              outputLSTM,
                              lstm_param,
                              activationLayerInfo,
                              cell_threshold,
                              projection_threshold);

        m_Layers.emplace_back(std::move(lstm_layer));
    }

    armcomputetensorutils::InitialiseArmComputeTensorEmpty(*m_ScratchBuffer);

    InitializeArmComputeTensorData(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights);
    InitializeArmComputeTensorData(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights);
    InitializeArmComputeTensorData(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights);
    InitializeArmComputeTensorData(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights);
    InitializeArmComputeTensorData(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights);
    InitializeArmComputeTensorData(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights);
    InitializeArmComputeTensorData(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias);
    InitializeArmComputeTensorData(*m_CellBiasTensor, m_Data.m_CellBias);
    InitializeArmComputeTensorData(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias);

    if (!m_Data.m_Parameters.m_CifgEnabled)
    {
        InitializeArmComputeTensorData(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights);
        InitializeArmComputeTensorData(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights);
        if (m_Data.m_CellToInputWeights != nullptr)
        {
            InitializeArmComputeTensorData(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights);
        }
        InitializeArmComputeTensorData(*m_InputGateBiasTensor, m_Data.m_InputGateBias);
    }

    if (m_Data.m_Parameters.m_ProjectionEnabled)
    {
        InitializeArmComputeTensorData(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights);
        if (m_Data.m_ProjectionBias != nullptr)
        {
            InitializeArmComputeTensorData(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias);
        }
    }

    if (m_Data.m_Parameters.m_PeepholeEnabled)
    {
        InitializeArmComputeTensorData(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights);
        InitializeArmComputeTensorData(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights);
    }

    if (m_Data.m_Parameters.m_LayerNormEnabled)
    {
        if (!m_Data.m_Parameters.m_CifgEnabled)
        {
            InitializeArmComputeTensorData(*m_InputLayerNormWeightsTensor, m_Data.m_InputLayerNormWeights);
        }
        InitializeArmComputeTensorData(*m_ForgetLayerNormWeightsTensor, m_Data.m_ForgetLayerNormWeights);
        InitializeArmComputeTensorData(*m_CellLayerNormWeightsTensor, m_Data.m_CellLayerNormWeights);
        InitializeArmComputeTensorData(*m_OutputLayerNormWeightsTensor, m_Data.m_OutputLayerNormWeights);
    }

    // Force Compute Library to perform the necessary copying and reshaping.
    // Afterwards, delete all the input tensors that will no longer be needed.
    for (uint32_t i = 0; i < m_Layers.size(); ++i)
    {
        m_Layers[i]->prepare();
    }

    //
    // Concat
    //

    // Expand dimensions of LSTM outputs adding one empty dimension to fit concatenate inputs.
    TensorShape shape = GetTensorShape(m_ConcatInputs[0]->info()->tensor_shape(), 1U);
    TensorShape shapeExpandTimeMajor({1, shape[0], shape[1]});
    TensorShape shapeExpandBatchMajor({shape[0], 1, shape[1]});
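    // Each per-timestep LSTM output is 2D [batchSize, outputSize]; the expanded 3D shapes
    // above let those outputs be concatenated along the time axis, e.g. a slice becomes
    // [1, batchSize, outputSize] in time major format.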

    if (maxTime != 1) // ACL concat does not work with only one element to concatenate.
    {
        for (unsigned int i = 0; i < maxTime; ++i)
        {
            m_ConcatInputs[i]->info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandTimeMajor));
        }

        ConcatDescriptor concatDescriptor(maxTime, numberDimensions); // maxTime = num inputs (aka. number of views).
        for (unsigned int inputIdx = 0u; inputIdx < maxTime; ++inputIdx)
        {
            concatDescriptor.SetViewOriginCoord(inputIdx, dimension, inputIdx);
            concatDescriptor.SetConcatAxis(dimension);
        }

        m_Concat.reset(new arm_compute::NEConcatenateLayer());
        unsigned int aclAxisConcat = CalcAclAxis(concatDescriptor.GetNumDimensions(), concatDescriptor.GetConcatAxis());
        if (!m_Data.m_Parameters.m_TimeMajor)
        {
            TensorInfo concatOutputTensorInfo = outputInfo;
            concatOutputTensorInfo.SetShape(timeMajorShapeOutput);
            BuildArmComputeTensor(concat_out, concatOutputTensorInfo);
            armcomputetensorutils::InitialiseArmComputeTensorEmpty(concat_out);

            m_Concat->configure(m_ConcatInputs, &concat_out, aclAxisConcat);
        }
        else
        {
            m_Concat->configure(m_ConcatInputs, &output, aclAxisConcat);
        }

        m_Concat->prepare();
    }
    // If only one LSTM batch, we do not concat and/or permute.
    // Must ensure final output info is expanded to correct batch major dimensions.
    else
    {
        if (!m_Data.m_Parameters.m_TimeMajor)
        {
            output.info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandBatchMajor));
        }
        else
        {
            output.info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandTimeMajor));
        }
    }

    //
    // Permute: only done if input/output are in batch major format.
    //
    if (!m_Data.m_Parameters.m_TimeMajor)
    {
        // Output now time major. Permute output back to batch major.
        std::unique_ptr<arm_compute::NEPermute> layer(new arm_compute::NEPermute());
        if (maxTime != 1)
        {
            layer->configure(&concat_out, &output, arm_compute::PermutationVector(0U, 2U, 1U));
        }
        else
        {
            layer->configure(m_ConcatInputs[0], &output, arm_compute::PermutationVector(0U, 2U, 1U));
        }
        m_Permute2.reset(layer.release());
    }

    FreeUnusedTensors();
}

void NeonUnidirectionalSequenceLstmFloatWorkload::Execute() const
{
    ARMNN_SCOPED_PROFILING_EVENT_NEON_NAME_GUID("NeonUnidirectionalSequenceLstmFloatWorkload_Execute");
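    // Run the stages in the order they were configured:
    // permute -> split -> per-timestep LSTM -> concat -> permute.
    // Stages that were not needed for this configuration are skipped.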
    if (m_Permute1)
    {
        m_Permute1->run();
    }
    if (m_Splitter)
    {
        m_Splitter->run();
    }
    for (uint32_t i = 0; i < m_Layers.size(); ++i)
    {
        m_Layers[i]->run();
    }
    if (m_Concat)
    {
        m_Concat->run();
    }
    if (m_Permute2)
    {
        m_Permute2->run();
    }
}

arm_compute::Status
NeonUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input,
                                                    const TensorInfo& outputStateIn,
                                                    const TensorInfo& cellStateIn,
                                                    const TensorInfo& outputStateOut,
                                                    const TensorInfo& cellStateOut,
                                                    const TensorInfo& output,
                                                    const UnidirectionalSequenceLstmDescriptor& descriptor,
                                                    const LstmInputParamsInfo& paramsInfo)
{
    TensorShape inputLayerShape = input.GetShape();
    TensorShape outputLayerShape = output.GetShape();

    if (inputLayerShape.GetNumDimensions() != 3)
    {
        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
                                   "Unidirectional Sequence LSTM layer validate status failed.");
    }

    unsigned int maxTime = descriptor.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1];
    unsigned int batchSize = descriptor.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0];
    unsigned int inputSize = inputLayerShape[2];
    unsigned int outputSize = outputLayerShape[2];

    const TensorShape timeMajorShapeInput({maxTime, batchSize, inputSize});
    const TensorShape timeMajorShapeOutput({maxTime, batchSize, outputSize});

    arm_compute::Status statusPermute1 = arm_compute::Status(arm_compute::ErrorCode::OK,
                                                             "Permute1 status");
    arm_compute::Status statusSplit = arm_compute::Status(arm_compute::ErrorCode::OK,
                                                          "Split status");
    arm_compute::Status statusLSTM = arm_compute::Status(arm_compute::ErrorCode::OK,
                                                         "LSTM status");
    arm_compute::Status statusConcat = arm_compute::Status(arm_compute::ErrorCode::OK,
                                                           "Concat status");
    arm_compute::Status statusPermute2 = arm_compute::Status(arm_compute::ErrorCode::OK,
                                                             "Permute2 status");

    const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);
    const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);

    //
    // Permute validate
    //
    TensorInfo permuteOutInfo = armnnUtils::Permuted(input, { 1U, 0U, 2U });
    arm_compute::TensorInfo aclPermuteOutInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permuteOutInfo);
    if (!descriptor.m_TimeMajor)
    {
        statusPermute1 = arm_compute::NEPermute::validate(&aclInputInfo,
                                                          &aclPermuteOutInfo,
                                                          arm_compute::PermutationVector(0U, 2U, 1U));
    }

    //
    // Split and Concat Tensors validate
    //
    std::vector<arm_compute::TensorInfo> splitterOutputsTensorInfos;
    std::vector<arm_compute::TensorInfo> concatInputsTensorInfos;
    std::vector<arm_compute::ITensorInfo*> splitterOutputsTensorInfosPtr;
    std::vector<const arm_compute::ITensorInfo*> concatInputsTensorInfosPtr;
    splitterOutputsTensorInfos.reserve(maxTime);
    concatInputsTensorInfos.reserve(maxTime);
    for (unsigned int i = 0; i < maxTime; ++i)
    {
        arm_compute::TensorInfo splitter_out;
        arm_compute::TensorInfo concat_in;

        auto splitterTensorInfo = TensorInfo(input);
        auto concatTensorInfo = TensorInfo(output);
        splitterTensorInfo.SetShape({batchSize, inputSize});
        concatTensorInfo.SetShape({batchSize, outputSize});

        arm_compute::TensorInfo aclSplitterTensorInfo
            = armcomputetensorutils::BuildArmComputeTensorInfo(splitterTensorInfo);
        arm_compute::TensorInfo aclConcatTensorInfo
            = armcomputetensorutils::BuildArmComputeTensorInfo(concatTensorInfo);

        splitterOutputsTensorInfos.emplace_back(aclSplitterTensorInfo);
        concatInputsTensorInfos.emplace_back(aclConcatTensorInfo);
        splitterOutputsTensorInfosPtr.emplace_back(&splitterOutputsTensorInfos[i]);
        concatInputsTensorInfosPtr.emplace_back(&concatInputsTensorInfos[i]);
    }

    //
    // Split validate
    //
    unsigned int numberDimensions = 3;
    unsigned int dimension = 0; // splitting on 0-dimension (i.e. maxTime dimension)
    unsigned int aclAxisSplit = CalcAclAxis(numberDimensions, dimension);

    if (maxTime != 1) // ACL split does not work with only one element to split.
    {
        if (!descriptor.m_TimeMajor)
        {
            statusSplit = arm_compute::NESplit::validate(&aclPermuteOutInfo,
                                                         splitterOutputsTensorInfosPtr,
                                                         aclAxisSplit);
        }
        else
        {
            statusSplit = arm_compute::NESplit::validate(&aclInputInfo, splitterOutputsTensorInfosPtr, aclAxisSplit);
        }
    }

    //
    // LSTM validate
    //

    arm_compute::LSTMParams<arm_compute::ITensorInfo> lstm_params_info;

    unsigned int numUnits = cellStateIn.GetShape()[1];
    unsigned int scratchBufferFactor = 4;

    if (descriptor.m_CifgEnabled)
    {
        // scratchBuffer = { batchSize, numUnits * 3 } with CIFG
        scratchBufferFactor = 3;
    }

    const TensorInfo& scratchBuffer = TensorInfo({ batchSize, numUnits * scratchBufferFactor }, input.GetDataType());

    // The inputs and outputs
    const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn);
    const arm_compute::TensorInfo aclCellStateInInfo = BuildArmComputeTensorInfo(cellStateIn);
    const arm_compute::TensorInfo aclScratchBufferInfo = BuildArmComputeTensorInfo(scratchBuffer);
    const arm_compute::TensorInfo aclOutputStateOutInfo = BuildArmComputeTensorInfo(outputStateOut);
    const arm_compute::TensorInfo aclCellStateOutInfo = BuildArmComputeTensorInfo(cellStateOut);

    // Basic parameters
    const arm_compute::TensorInfo aclInputToForgetWeightsInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetInputToForgetWeights());
    const arm_compute::TensorInfo aclInputToCellWeightsInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetInputToCellWeights());
    const arm_compute::TensorInfo aclInputToOutputWeightsInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetInputToOutputWeights());
    const arm_compute::TensorInfo aclRecurrentToForgetWeightsInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToForgetWeights());
    const arm_compute::TensorInfo aclRecurrentToCellWeightsInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToCellWeights());
    const arm_compute::TensorInfo aclRecurrentToOutputWeightsInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToOutputWeights());
    const arm_compute::TensorInfo aclForgetGateBiasInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetForgetGateBias());
    const arm_compute::TensorInfo aclCellBiasInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetCellBias());
    const arm_compute::TensorInfo aclOutputGateBiasInfo
        = BuildArmComputeTensorInfo(paramsInfo.GetOutputGateBias());

    arm_compute::TensorInfo aclInputToInputWeightsInfo;
    arm_compute::TensorInfo aclRecurrentToInputWeightsInfo;
    arm_compute::TensorInfo aclCellToInputWeightsInfo;
    arm_compute::TensorInfo aclInputGateBiasInfo;
    arm_compute::TensorInfo aclProjectionWeightsInfo;
    arm_compute::TensorInfo aclProjectionBiasInfo;
    arm_compute::TensorInfo aclCellToForgetWeightsInfo;
    arm_compute::TensorInfo aclCellToOutputWeightsInfo;

    arm_compute::TensorInfo aclInputLayerNormWeightsInfo;
    arm_compute::TensorInfo aclForgetLayerNormWeightsInfo;
    arm_compute::TensorInfo aclCellLayerNormWeightsInfo;
    arm_compute::TensorInfo aclOutputLayerNormWeightsInfo;

    if (!descriptor.m_CifgEnabled)
    {
        if (descriptor.m_PeepholeEnabled)
        {
            aclCellToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToInputWeights());
        }
        aclInputToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputToInputWeights());
        aclRecurrentToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToInputWeights());
        aclInputGateBiasInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputGateBias());

        lstm_params_info.set_cifg_params(&aclInputToInputWeightsInfo,
                                         &aclRecurrentToInputWeightsInfo,
                                         descriptor.m_PeepholeEnabled ? &aclCellToInputWeightsInfo : nullptr,
                                         &aclInputGateBiasInfo);
    }

    if (descriptor.m_ProjectionEnabled)
    {
        if (paramsInfo.m_ProjectionBias != nullptr)
        {
            aclProjectionBiasInfo = BuildArmComputeTensorInfo(paramsInfo.GetProjectionBias());
        }
        aclProjectionWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetProjectionWeights());

        lstm_params_info.set_projection_params(&aclProjectionWeightsInfo,
                                               paramsInfo.m_ProjectionBias ? &aclProjectionBiasInfo : nullptr);
    }

    if (descriptor.m_PeepholeEnabled)
    {
        aclCellToForgetWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToForgetWeights());
        aclCellToOutputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToOutputWeights());

        lstm_params_info.set_peephole_params(&aclCellToForgetWeightsInfo, &aclCellToOutputWeightsInfo);
    }

    if (descriptor.m_LayerNormEnabled)
    {
        if (!descriptor.m_CifgEnabled)
        {
            aclInputLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputLayerNormWeights());
        }
        aclForgetLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetForgetLayerNormWeights());
        aclCellLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellLayerNormWeights());
        aclOutputLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetOutputLayerNormWeights());

        lstm_params_info.set_layer_normalization_params(descriptor.m_CifgEnabled ? nullptr :
                                                            &aclInputLayerNormWeightsInfo,
                                                        &aclForgetLayerNormWeightsInfo,
                                                        &aclCellLayerNormWeightsInfo,
                                                        &aclOutputLayerNormWeightsInfo);
    }

    // Need to be set at negative threshold to be compatible with ACL.
    float cell_threshold = descriptor.m_ClippingThresCell;
    float projection_threshold = descriptor.m_ClippingThresProj;

    arm_compute::ActivationLayerInfo activationLayerInfo =
        ConvertLstmActivationFuncToAclLayerInfo(descriptor.m_ActivationFunc);

    for (unsigned int i = 0; i != maxTime; ++i)
    {
        // Set LSTM input and output ITensors depending on:
        // input format (timeMajor) & number of LSTM batches (maxTime).
        arm_compute::ITensorInfo* outputLSTM;
        arm_compute::ITensorInfo* inputLSTM;

        // If there is only one LSTM time major batch, we will not concat OR permute.
        // Set input of LSTM to be first input ITensor.
        // Set output of LSTM to be final output ITensor.
        // LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo.
        if (maxTime == 1 && descriptor.m_TimeMajor)
        {
            TensorShape inputShape = GetTensorShape(aclInputInfo.tensor_shape(), 1U);
            TensorShape outputShape = GetTensorShape(aclOutputInfo.tensor_shape(), 1U);

            TensorShape inputShapeShrink({inputShape[1], inputShape[2]});
            TensorShape outputShapeShrink({outputShape[1], outputShape[2]});

            auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);
            auto acl_output_shape_shrink = BuildArmComputeTensorShape(outputShapeShrink);

            const_cast<arm_compute::TensorInfo*>(&aclInputInfo)->set_tensor_shape(acl_input_shape_shrink);
            inputLSTM = const_cast<arm_compute::TensorInfo*>(&aclInputInfo);

            const_cast<arm_compute::TensorInfo*>(&aclOutputInfo)->set_tensor_shape(acl_output_shape_shrink);
            outputLSTM = const_cast<arm_compute::TensorInfo*>(&aclOutputInfo);
        }
        // If there is only one LSTM batch major batch, we will not concat, only permute.
        // Set input of LSTM to be output of initial permute.
        // Set output of LSTM to be first element of m_ConcatInputs & use that value later in permute.
        // LSTM output cannot be > 2 dimensions so need to resize its TensorInfo.
        else if (maxTime == 1 && !descriptor.m_TimeMajor)
        {
            TensorShape inputShape = GetTensorShape(aclPermuteOutInfo.tensor_shape(), 1U);
            TensorShape inputShapeShrink({inputShape[1], inputShape[2]});
            auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);
            aclPermuteOutInfo.set_tensor_shape(acl_input_shape_shrink);
            inputLSTM = &aclPermuteOutInfo;

            outputLSTM = const_cast<arm_compute::ITensorInfo*>(concatInputsTensorInfosPtr[i]);
        }
        // Batch major AND/OR 2+ LSTM batches so will use concat AND/OR permute later on.
        else
        {
            inputLSTM = splitterOutputsTensorInfosPtr[i];
            outputLSTM = const_cast<arm_compute::ITensorInfo*>(concatInputsTensorInfosPtr[i]);
        }

        statusLSTM = arm_compute::NELSTMLayer::validate(inputLSTM,
                                                        &aclInputToForgetWeightsInfo,
                                                        &aclInputToCellWeightsInfo,
                                                        &aclInputToOutputWeightsInfo,
                                                        &aclRecurrentToForgetWeightsInfo,
                                                        &aclRecurrentToCellWeightsInfo,
                                                        &aclRecurrentToOutputWeightsInfo,
                                                        &aclForgetGateBiasInfo,
                                                        &aclCellBiasInfo,
                                                        &aclOutputGateBiasInfo,
                                                        &aclOutputStateInInfo,
                                                        &aclCellStateInInfo,
                                                        &aclScratchBufferInfo,
                                                        &aclOutputStateOutInfo,
                                                        &aclCellStateOutInfo,
                                                        outputLSTM,
                                                        lstm_params_info,
                                                        activationLayerInfo,
                                                        cell_threshold,
                                                        projection_threshold);

        if (statusLSTM.error_code() != arm_compute::ErrorCode::OK)
        {
            break;
        }
    }

    //
    // Concat validate
    //

    // Expand dimensions of LSTM outputs adding one empty dimension to fit concatenate inputs.
    TensorShape shape = GetTensorShape(concatInputsTensorInfosPtr[0]->tensor_shape(), 1U);
    TensorShape shapeExpandTimeMajor({1, shape[0], shape[1]});
    TensorShape shapeExpandBatchMajor({shape[0], 1, shape[1]});

    TensorInfo concatOutputTensorInfo = TensorInfo(output);
    concatOutputTensorInfo.SetShape(timeMajorShapeOutput);
    arm_compute::TensorInfo aclConcatOutputTensorInfo = BuildArmComputeTensorInfo(concatOutputTensorInfo);

    if (maxTime != 1) // ACL concat does not work with only one element to concatenate.
    {
        for (unsigned int i = 0; i < maxTime; ++i)
        {
            auto acl_shape_expand = BuildArmComputeTensorShape(shapeExpandTimeMajor);
            concatInputsTensorInfos[i].set_tensor_shape(acl_shape_expand);
        }

        unsigned int aclAxisConcat = CalcAclAxis(numberDimensions, dimension);
        if (!descriptor.m_TimeMajor)
        {
            statusConcat = arm_compute::NEConcatenateLayer::validate(concatInputsTensorInfosPtr,
                                                                     &aclConcatOutputTensorInfo,
                                                                     aclAxisConcat);
        }
        else
        {
            statusConcat = arm_compute::NEConcatenateLayer::validate(concatInputsTensorInfosPtr,
                                                                     &aclOutputInfo,
                                                                     aclAxisConcat);
        }
    }
    // If only one LSTM batch, we do not concat and/or permute.
    // Must ensure final output info is expanded to correct batch major dimensions.
    else
    {
        if (!descriptor.m_TimeMajor)
        {
            const_cast<arm_compute::TensorInfo*>(&aclInputInfo)->set_tensor_shape(
                BuildArmComputeTensorShape(shapeExpandBatchMajor));
        }
        else
        {
            const_cast<arm_compute::TensorInfo*>(&aclInputInfo)->set_tensor_shape(
                BuildArmComputeTensorShape(shapeExpandTimeMajor));
        }
    }

    //
    // Permute validate
    //
    if (!descriptor.m_TimeMajor)
    {
        // Output now time major. Permute output back to batch major.
        if (maxTime != 1)
        {
            statusPermute2 = arm_compute::NEPermute::validate(&aclConcatOutputTensorInfo,
                                                              &aclOutputInfo,
                                                              arm_compute::PermutationVector(0U, 2U, 1U));
        }
        else
        {
            statusPermute2 = arm_compute::NEPermute::validate(concatInputsTensorInfosPtr[0],
                                                              &aclOutputInfo,
                                                              arm_compute::PermutationVector(0U, 2U, 1U));
        }
    }

    auto okCode = arm_compute::ErrorCode::OK;
    if (statusPermute1.error_code() == okCode &&
        statusSplit.error_code() == okCode &&
        statusLSTM.error_code() == okCode &&
        statusConcat.error_code() == okCode &&
        statusPermute2.error_code() == okCode)
    {
        return arm_compute::Status(arm_compute::ErrorCode::OK,
                                   "All Unidirectional Sequence LSTM layer validate status OK.");
    }
    else
    {
        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
                                   "Unidirectional Sequence LSTM layer validate status failed.");
    }
}

void NeonUnidirectionalSequenceLstmFloatWorkload::FreeUnusedTensors()
{
    FreeTensorIfUnused(m_InputToInputWeightsTensor);
    FreeTensorIfUnused(m_InputToForgetWeightsTensor);
    FreeTensorIfUnused(m_InputToCellWeightsTensor);
    FreeTensorIfUnused(m_InputToOutputWeightsTensor);
    FreeTensorIfUnused(m_RecurrentToInputWeightsTensor);
    FreeTensorIfUnused(m_RecurrentToForgetWeightsTensor);
    FreeTensorIfUnused(m_RecurrentToCellWeightsTensor);
    FreeTensorIfUnused(m_RecurrentToOutputWeightsTensor);
    FreeTensorIfUnused(m_CellToInputWeightsTensor);
    FreeTensorIfUnused(m_CellToForgetWeightsTensor);
    FreeTensorIfUnused(m_CellToOutputWeightsTensor);
    FreeTensorIfUnused(m_InputGateBiasTensor);
    FreeTensorIfUnused(m_ForgetGateBiasTensor);
    FreeTensorIfUnused(m_CellBiasTensor);
    FreeTensorIfUnused(m_OutputGateBiasTensor);
    FreeTensorIfUnused(m_ProjectionWeightsTensor);
    FreeTensorIfUnused(m_ProjectionBiasTensor);
    FreeTensorIfUnused(m_InputLayerNormWeightsTensor);
    FreeTensorIfUnused(m_ForgetLayerNormWeightsTensor);
    FreeTensorIfUnused(m_CellLayerNormWeightsTensor);
    FreeTensorIfUnused(m_OutputLayerNormWeightsTensor);
    FreeTensorIfUnused(m_ScratchBuffer);
}

} //namespace armnn