Blame - src/backends/cl/workloads/ClUnidirectionalSequenceLstmFloatWorkload.cpp - ml/armnn

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

1

//

Narumol Prangnawarat

71311e4

2023-05-29 15:54:57 +0100

[diff] [blame^]

2

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

3

// SPDX-License-Identifier: MIT

4

//

5

6

#include "ClUnidirectionalSequenceLstmFloatWorkload.hpp"

7

#include "ClWorkloadUtils.hpp"

8

9

#include <aclCommon/ArmComputeUtils.hpp>

10

#include <aclCommon/ArmComputeTensorUtils.hpp>

11

12

#include <armnn/utility/NumericCast.hpp>

13

#include <armnnUtils/Permute.hpp>

14

#include <cl/test/ClWorkloadFactoryHelper.hpp>

15

#include <backendsCommon/WorkloadUtils.hpp>

16

17

#include "cl/ClTensorHandle.hpp"

namespace

{

unsigned int CalcAclAxis(unsigned int numDimensions, unsigned int axis)

22

{

23

return (numDimensions - axis) - 1;

}

} //namespace

namespace armnn

{

using namespace armcomputetensorutils;

30

31

ClUnidirectionalSequenceLstmFloatWorkload::ClUnidirectionalSequenceLstmFloatWorkload

32

(const UnidirectionalSequenceLstmQueueDescriptor& descriptor,

33

const WorkloadInfo& info,

34

const arm_compute::CLCompileContext& clCompileContext)

35

: FloatWorkload<UnidirectionalSequenceLstmQueueDescriptor>(descriptor, info)

36

{

37

// Report Profiling Details

38

ARMNN_REPORT_PROFILING_WORKLOAD_DESC("ClUnidirectionalSequenceLstmFloatWorkload_Construct",

39

descriptor.m_Parameters,

info,

GetGuid());

const arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();

Mike Kelly

1299496

2022-04-21 11:57:09 +0100

[diff] [blame]

44

arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[2])->GetTensor();

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

45

46

TensorInfo inputInfo = info.m_InputTensorInfos[0];

Mike Kelly

1299496

2022-04-21 11:57:09 +0100

[diff] [blame]

47

TensorInfo outputInfo = info.m_OutputTensorInfos[2];

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

48

49

arm_compute::DataType armComputeDataType = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetDataType();

50

armnn::DataType armnnDataType = GetArmNNDataType(armComputeDataType);

51

52

TensorShape inputLayerShape = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetShape();

53

TensorShape cellStateLayerShape = static_cast<IClTensorHandle*>(m_Data.m_Inputs[2])->GetShape();

Mike Kelly

1299496

2022-04-21 11:57:09 +0100

[diff] [blame]

54

TensorShape outputLayerShape = static_cast<IClTensorHandle*>(m_Data.m_Outputs[2])->GetShape();

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

55

56

unsigned int maxTime = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1];

57

unsigned int batchSize = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0];

58

unsigned int inputSize = inputLayerShape[2];

59

unsigned int outputSize = outputLayerShape[2];

60

unsigned int numUnits = cellStateLayerShape[1];

61

62

const TensorShape timeMajorShapeInput({maxTime, batchSize, inputSize});

63

const TensorShape timeMajorShapeOutput({maxTime, batchSize, outputSize});

64

65

//

66

// Permute: performed if Unidirectional Sequence Layer inputs/outputs are in batch major format.

67

//

68

if (!m_Data.m_Parameters.m_TimeMajor)

69

{

70

std::unique_ptr<arm_compute::CLPermute> layer(new arm_compute::CLPermute());

71

72

TensorInfo permuteOutInfo = inputInfo;

73

permuteOutInfo.SetShape(timeMajorShapeInput);

74

BuildArmComputeTensor(m_PermuteFirstOut, permuteOutInfo);

75

armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_PermuteFirstOut);

76

77

// Permute to time major format.

78

layer->configure(clCompileContext, &input, &m_PermuteFirstOut, arm_compute::PermutationVector(0U,2U,1U));

79

m_Permute1.reset(layer.release());

}

//

// Split and Concat Tensors

84

//

85

for (unsigned int i = 0; i < maxTime; ++i)

86

{

87

arm_compute::CLTensor splitter_out;

88

arm_compute::CLTensor concat_in;

89

90

auto splitterTensorInfo = inputInfo;

91

auto concatTensorInfo = outputInfo;

92

splitterTensorInfo.SetShape({batchSize, inputSize});

93

concatTensorInfo.SetShape({batchSize, outputSize});

94

BuildArmComputeTensor(splitter_out, splitterTensorInfo);

95

BuildArmComputeTensor(concat_in, concatTensorInfo);

96

97

armcomputetensorutils::InitialiseArmComputeTensorEmpty(splitter_out);

98

armcomputetensorutils::InitialiseArmComputeTensorEmpty(concat_in);

99

100

// append to std::vector<arm_compute::CLTensor>

101

m_SplitterOutputsTensors.push_back(std::move(splitter_out));

102

m_ConcatInputsTensors.push_back(std::move(concat_in));

103

}

104

105

for (unsigned int i = 0; i < maxTime; ++i)

106

{

107

// append to std::vector<arm_compute::ICLTensor*>

108

m_SplitterOutputs.push_back(&m_SplitterOutputsTensors[i]);

109

m_ConcatInputs.push_back(&m_ConcatInputsTensors[i]);

}

//

// Split

//

unsigned int numberDimensions = 3;

116

unsigned int dimension = 0; // splitting on 0-dimension (i.e. maxTime dimension)

117

118

if (maxTime != 1) // ACL split does not work with only one element to split.

119

{

120

ViewsDescriptor splitterDesc(maxTime, numberDimensions);

121

unsigned int splitterDimSizes[3] = {1, batchSize, inputSize};

122

for (unsigned int outputIdx = 0u; outputIdx < maxTime; ++outputIdx)

123

{

124

splitterDesc.SetViewOriginCoord(outputIdx, dimension, splitterDimSizes[dimension] * outputIdx);

125

for (unsigned int dimIdx = 0u; dimIdx < numberDimensions; ++dimIdx)

126

{

127

splitterDesc.SetViewSize(outputIdx, dimIdx, splitterDimSizes[dimIdx]);

}

}

std::set<unsigned int> splitAxis = ComputeSplitAxis(splitterDesc, timeMajorShapeInput);

132

133

std::unique_ptr<arm_compute::CLSplit> split_layer(new arm_compute::CLSplit());

134

unsigned int aclAxisSplit = CalcAclAxis(splitterDesc.GetNumDimensions(), *splitAxis.begin());

135

if (!m_Data.m_Parameters.m_TimeMajor)

136

{

137

split_layer->configure(&m_PermuteFirstOut, m_SplitterOutputs, aclAxisSplit);

}

else

{

split_layer->configure(&input, m_SplitterOutputs, aclAxisSplit);

142

}

143

144

split_layer->prepare();

145

m_Splitter.reset(split_layer.release());

}

//

// Lstm

//

arm_compute::LSTMParams<arm_compute::ICLTensor> lstm_param;

152

153

m_InputToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();

154

BuildArmComputeTensor(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights->GetTensorInfo());

155

156

m_InputToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>();

157

BuildArmComputeTensor(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights->GetTensorInfo());

158

159

m_InputToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();

160

BuildArmComputeTensor(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights->GetTensorInfo());

161

162

m_RecurrentToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();

163

BuildArmComputeTensor(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights->GetTensorInfo());

164

165

m_RecurrentToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>();

166

BuildArmComputeTensor(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights->GetTensorInfo());

167

168

m_RecurrentToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();

169

BuildArmComputeTensor(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights->GetTensorInfo());

170

171

m_ForgetGateBiasTensor = std::make_unique<arm_compute::CLTensor>();

172

BuildArmComputeTensor(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias->GetTensorInfo());

173

174

m_CellBiasTensor = std::make_unique<arm_compute::CLTensor>();

175

BuildArmComputeTensor(*m_CellBiasTensor, m_Data.m_CellBias->GetTensorInfo());

176

177

m_OutputGateBiasTensor = std::make_unique<arm_compute::CLTensor>();

178

BuildArmComputeTensor(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias->GetTensorInfo());

179

180

// for future reference: check the AndroidNN API for the logic here

181

if (!m_Data.m_Parameters.m_CifgEnabled)

182

{

183

m_InputToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();

184

BuildArmComputeTensor(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights->GetTensorInfo());

185

186

m_RecurrentToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();

187

BuildArmComputeTensor(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights->GetTensorInfo());

188

189

m_CellToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();

190

if (m_Data.m_CellToInputWeights != nullptr)

191

{

192

BuildArmComputeTensor(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights->GetTensorInfo());

193

}

194

195

m_InputGateBiasTensor = std::make_unique<arm_compute::CLTensor>();

196

BuildArmComputeTensor(*m_InputGateBiasTensor, m_Data.m_InputGateBias->GetTensorInfo());

197

198

lstm_param.set_cifg_params(m_InputToInputWeightsTensor.get(),

199

m_RecurrentToInputWeightsTensor.get(),

200

m_Data.m_CellToInputWeights ? m_CellToInputWeightsTensor.get() : nullptr,

201

m_InputGateBiasTensor.get());

202

}

203

204

if (m_Data.m_Parameters.m_ProjectionEnabled)

205

{

206

m_ProjectionWeightsTensor = std::make_unique<arm_compute::CLTensor>();

207

BuildArmComputeTensor(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights->GetTensorInfo());

208

209

m_ProjectionBiasTensor = std::make_unique<arm_compute::CLTensor>();

210

if (m_Data.m_ProjectionBias != nullptr)

211

{

212

BuildArmComputeTensor(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias->GetTensorInfo());

213

}

214

215

lstm_param.set_projection_params(m_ProjectionWeightsTensor.get(),

216

m_Data.m_ProjectionBias ? m_ProjectionBiasTensor.get() : nullptr);

217

}

218

219

if (m_Data.m_Parameters.m_PeepholeEnabled)

220

{

221

m_CellToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();

222

BuildArmComputeTensor(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights->GetTensorInfo());

223

224

m_CellToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();

225

BuildArmComputeTensor(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights->GetTensorInfo());

226

227

lstm_param.set_peephole_params(m_CellToForgetWeightsTensor.get(), m_CellToOutputWeightsTensor.get());

228

}

229

230

if (m_Data.m_Parameters.m_LayerNormEnabled)

231

{

232

m_InputLayerNormWeightsTensor = std::make_unique<arm_compute::CLTensor>();

233

if (!m_Data.m_Parameters.m_CifgEnabled)

234

{

235

BuildArmComputeTensor(*m_InputLayerNormWeightsTensor, m_Data.m_InputLayerNormWeights->GetTensorInfo());

236

}

237

238

m_ForgetLayerNormWeightsTensor = std::make_unique<arm_compute::CLTensor>();

239

BuildArmComputeTensor(*m_ForgetLayerNormWeightsTensor, m_Data.m_ForgetLayerNormWeights->GetTensorInfo());

240

241

m_CellLayerNormWeightsTensor = std::make_unique<arm_compute::CLTensor>();

242

BuildArmComputeTensor(*m_CellLayerNormWeightsTensor, m_Data.m_CellLayerNormWeights->GetTensorInfo());

243

244

m_OutputLayerNormWeightsTensor = std::make_unique<arm_compute::CLTensor>();

245

BuildArmComputeTensor(*m_OutputLayerNormWeightsTensor, m_Data.m_OutputLayerNormWeights->GetTensorInfo());

246

247

auto inputNormWeightTensor = m_Data.m_Parameters.m_CifgEnabled ? nullptr : m_InputLayerNormWeightsTensor.get();

248

lstm_param.set_layer_normalization_params(inputNormWeightTensor,

249

m_ForgetLayerNormWeightsTensor.get(),

250

m_CellLayerNormWeightsTensor.get(),

251

m_OutputLayerNormWeightsTensor.get());

252

}

253

254

arm_compute::ICLTensor& output_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();

255

arm_compute::ICLTensor& cell_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[2])->GetTensor();

256

257

arm_compute::ICLTensor& output_state_out = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();

258

arm_compute::ICLTensor& cell_state_out = static_cast<IClTensorHandle*>(m_Data.m_Inputs[2])->GetTensor();

259

260

m_ScratchBuffer = std::make_unique<arm_compute::CLTensor>();

261

if (m_Data.m_Parameters.m_CifgEnabled)

262

{

263

// scratch_buffer [num_units * 3, batch_size] with CIFG

264

BuildArmComputeTensor(*m_ScratchBuffer, TensorInfo({batchSize, numUnits * 3}, armnnDataType));

}

else

{

// scratch_buffer [num_units * 4, batch_size] without CIFG

269

BuildArmComputeTensor(*m_ScratchBuffer, TensorInfo({batchSize, numUnits * 4}, armnnDataType));

270

}

271

272

// Need to be set at negative threshold to be compatible for ACL

273

float cell_threshold = m_Data.m_Parameters.m_ClippingThresCell;

274

float projection_threshold = m_Data.m_Parameters.m_ClippingThresProj;

275

276

// For preparing the object for the class ActivationLayerInfo, consider 5 situations

277

arm_compute::ActivationLayerInfo activationLayerInfo =

278

ConvertLstmActivationFuncToAclLayerInfo(m_Data.m_Parameters.m_ActivationFunc);

279

280

for (unsigned int i = 0; i != maxTime; ++i)

281

{

282

// Set LSTM input and output ITensors depending on:

283

// input format (timeMajor) & number of LSTM batches (maxTime).

284

arm_compute::ICLTensor* outputLSTM;

285

arm_compute::ICLTensor* inputLSTM;

286

// If there is only one LSTM time major batch, we will not concat OR permute.

287

// Set input of LSTM to be first input ITensor.

288

// Set output of LSTM to be final output ITensor.

289

// LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo.

290

if (maxTime == 1 && m_Data.m_Parameters.m_TimeMajor)

291

{

292

TensorShape inputShape = GetTensorShape((&input)->info()->tensor_shape(), 1U);

293

TensorShape outputShape = GetTensorShape((&output)->info()->tensor_shape(), 1U);

294

TensorShape inputShapeShrink({inputShape[1], inputShape[2]});

295

TensorShape outputShapeShrink({outputShape[1], outputShape[2]});

296

auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);

297

auto acl_output_shape_shrink = BuildArmComputeTensorShape(outputShapeShrink);

298

(&input)->info()->set_tensor_shape(acl_input_shape_shrink);

299

inputLSTM = const_cast<arm_compute::ICLTensor*>(&input);

300

(&output)->info()->set_tensor_shape(acl_output_shape_shrink);

301

outputLSTM = &output;

302

}

303

// If there is only one LSTM batch major batch, we will not concat, only permute.

304

// Set input of LSTM to be output of initial permute.

305

// Set output of LSTM to be first element of m_ConcatInputs & use that value later in permute.

306

// LSTM output cannot be > 2 dimensions so need to resize its TensorInfo.

307

else if (maxTime == 1 && !m_Data.m_Parameters.m_TimeMajor)

308

{

309

TensorShape inputShape = GetTensorShape(m_PermuteFirstOut.info()->tensor_shape(), 1U);

310

TensorShape inputShapeShrink({inputShape[1], inputShape[2]});

311

auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);

312

m_PermuteFirstOut.info()->set_tensor_shape(acl_input_shape_shrink);

313

inputLSTM = &m_PermuteFirstOut;

314

outputLSTM = const_cast<arm_compute::ICLTensor*>(m_ConcatInputs[i]);

315

}

316

// Batch major AND/OR 2+ LSTM batches so will use concat AND/OR permute later on.

317

else

318

{

319

inputLSTM = m_SplitterOutputs[i];

320

outputLSTM = const_cast<arm_compute::ICLTensor*>(m_ConcatInputs[i]);

321

}

322

323

std::unique_ptr<arm_compute::CLLSTMLayer> lstm_layer(new arm_compute::CLLSTMLayer());

324

lstm_layer->configure(clCompileContext,

325

inputLSTM,

326

m_InputToForgetWeightsTensor.get(),

327

m_InputToCellWeightsTensor.get(),

328

m_InputToOutputWeightsTensor.get(),

329

m_RecurrentToForgetWeightsTensor.get(),

330

m_RecurrentToCellWeightsTensor.get(),

331

m_RecurrentToOutputWeightsTensor.get(),

332

m_ForgetGateBiasTensor.get(),

333

m_CellBiasTensor.get(),

334

m_OutputGateBiasTensor.get(),

335

&output_state_in,

336

&cell_state_in,

337

m_ScratchBuffer.get(),

&output_state_out,

&cell_state_out,

outputLSTM,

lstm_param,

activationLayerInfo,

cell_threshold,

projection_threshold);

345

346

m_Layers.emplace_back(std::move(lstm_layer));

347

}

348

349

armcomputetensorutils::InitialiseArmComputeTensorEmpty(*m_ScratchBuffer);

350

351

InitializeArmComputeClTensorData(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights);

352

InitializeArmComputeClTensorData(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights);

353

InitializeArmComputeClTensorData(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights);

354

InitializeArmComputeClTensorData(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights);

355

InitializeArmComputeClTensorData(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights);

356

InitializeArmComputeClTensorData(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights);

357

InitializeArmComputeClTensorData(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias);

358

InitializeArmComputeClTensorData(*m_CellBiasTensor, m_Data.m_CellBias);

359

InitializeArmComputeClTensorData(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias);

360

361

if (!m_Data.m_Parameters.m_CifgEnabled)

362

{

363

InitializeArmComputeClTensorData(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights);

364

InitializeArmComputeClTensorData(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights);

365

if (m_Data.m_CellToInputWeights != nullptr)

366

{

367

InitializeArmComputeClTensorData(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights);

368

}

369

InitializeArmComputeClTensorData(*m_InputGateBiasTensor, m_Data.m_InputGateBias);

370

}

371

372

if (m_Data.m_Parameters.m_ProjectionEnabled)

373

{

374

InitializeArmComputeClTensorData(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights);

375

if (m_Data.m_ProjectionBias != nullptr)

376

{

377

InitializeArmComputeClTensorData(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias);

}

}

if (m_Data.m_Parameters.m_PeepholeEnabled)

382

{

383

InitializeArmComputeClTensorData(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights);

384

InitializeArmComputeClTensorData(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights);

385

}

386

387

if (m_Data.m_Parameters.m_LayerNormEnabled)

388

{

389

if (!m_Data.m_Parameters.m_CifgEnabled)

390

{

391

InitializeArmComputeClTensorData(*m_InputLayerNormWeightsTensor, m_Data.m_InputLayerNormWeights);

392

}

393

InitializeArmComputeClTensorData(*m_ForgetLayerNormWeightsTensor, m_Data.m_ForgetLayerNormWeights);

394

InitializeArmComputeClTensorData(*m_CellLayerNormWeightsTensor, m_Data.m_CellLayerNormWeights);

395

InitializeArmComputeClTensorData(*m_OutputLayerNormWeightsTensor, m_Data.m_OutputLayerNormWeights);

396

}

397

398

// Force Compute Library to perform the necessary copying and reshaping.

399

// After which delete all the input tensors that will no longer be needed.

400

for (uint32_t i = 0; i < m_Layers.size(); ++i)

401

{

402

m_Layers[i]->prepare();

}

//

// Concat

//

// Expand dimensions of LSTM outputs adding one empty dimension to fit concatenate inputs.

410

TensorShape shape = GetTensorShape(m_ConcatInputs[0]->info()->tensor_shape(), 1U);

411

TensorShape shapeExpandTimeMajor({1, shape[0], shape[1]});

412

TensorShape shapeExpandBatchMajor({shape[0], 1, shape[1]});

413

414

if (maxTime != 1) // ACL concat does not work with only one element to concatenate.

415

{

416

for (unsigned int i = 0; i < maxTime; ++i)

417

{

418

m_ConcatInputs[i]->info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandTimeMajor));

419

}

420

421

ConcatDescriptor concatDescriptor(maxTime, numberDimensions); // maxTime = num inputs (aka. number of views).

422

for (unsigned int inputIdx = 0u; inputIdx < maxTime; ++inputIdx)

423

{

424

concatDescriptor.SetViewOriginCoord(inputIdx, dimension, inputIdx);

425

concatDescriptor.SetConcatAxis(dimension);

426

}

427

428

m_Concat.reset(new arm_compute::CLConcatenateLayer());

429

unsigned int aclAxisConcat = CalcAclAxis(concatDescriptor.GetNumDimensions(),

430

concatDescriptor.GetConcatAxis());

431

if (!m_Data.m_Parameters.m_TimeMajor)

432

{

433

TensorInfo concatOuputTensorInfo = outputInfo;

434

concatOuputTensorInfo.SetShape(timeMajorShapeOutput);

435

BuildArmComputeTensor(concat_out, concatOuputTensorInfo);

436

armcomputetensorutils::InitialiseArmComputeTensorEmpty(concat_out);

437

438

m_Concat->configure(m_ConcatInputs, &concat_out, aclAxisConcat);

}

else

{

m_Concat->configure(m_ConcatInputs, &output, aclAxisConcat);

}

m_Concat->prepare();

}

// If only one LSTM batch, we do not concat and/or permute.

448

// Must ensure final output info is expanded to correct batch major dimensions.

449

else

450

{

451

if (!m_Data.m_Parameters.m_TimeMajor)

452

{

453

(&output)->info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandBatchMajor));

}

else

{

(&output)->info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandTimeMajor));

}

}

//

// Permute: only done if input/output are in batch major format.

463

//

464

if (!m_Data.m_Parameters.m_TimeMajor)

465

{

466

// Output now time major. Permute output back to batch major.

467

std::unique_ptr<arm_compute::CLPermute> layer(new arm_compute::CLPermute());

468

if (maxTime != 1)

469

{

470

layer->configure(clCompileContext, &concat_out, &output, arm_compute::PermutationVector(0U, 2U, 1U));

}

else

{

layer->configure(clCompileContext, m_ConcatInputs[0], &output, arm_compute::PermutationVector(0U, 2U, 1U));

475

}

476

m_Permute2.reset(layer.release());

}

FreeUnusedTensors();

}

void ClUnidirectionalSequenceLstmFloatWorkload::Execute() const

483

{

484

ARMNN_SCOPED_PROFILING_EVENT_CL_GUID("ClUnidirectionalSequenceLstmFloatWorkload_Execute", GetGuid());

if (m_Permute1)

{

m_Permute1->run();

}

if (m_Splitter)

{

m_Splitter->run();

}

for (uint32_t i = 0; i < m_Layers.size(); ++i)

{

m_Layers[i]->run();

}

if (m_Concat)

{

m_Concat->run();

}

if (m_Permute2)

{

m_Permute2->run();

}

}

arm_compute::Status

ClUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input,

509

const TensorInfo& outputStateIn,

510

const TensorInfo& cellStateIn,

Narumol Prangnawarat

71311e4

2023-05-29 15:54:57 +0100

[diff] [blame^]

511

const TensorInfo& outputStateOut,

512

const TensorInfo& cellStateOut,

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

513

const TensorInfo& output,

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

514

const UnidirectionalSequenceLstmDescriptor& descriptor,

515

const LstmInputParamsInfo& paramsInfo)

516

{

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

517

TensorShape inputLayerShape = input.GetShape();

Narumol Prangnawarat

270641b

2023-05-22 10:57:47 +0100

[diff] [blame]

518

TensorShape outputLayerShape = output.GetShape();

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

519

Narumol Prangnawarat

71311e4

2023-05-29 15:54:57 +0100

[diff] [blame^]

520

if (inputLayerShape.GetNumDimensions() != 3)

521

{

522

return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,

523

"Unidirectional Sequence LSTM layer validate status failed.");

524

}

525

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

526

unsigned int maxTime = descriptor.m_TimeMajor?inputLayerShape[0]:inputLayerShape[1];

527

unsigned int batchSize = descriptor.m_TimeMajor?inputLayerShape[1]:inputLayerShape[0];

528

unsigned int inputSize = inputLayerShape[2];

529

unsigned int outputSize = outputLayerShape[2];

530

531

const TensorShape timeMajorShapeInput({maxTime, batchSize, inputSize});

532

const TensorShape timeMajorShapeOutput({maxTime, batchSize, outputSize});

533

534

arm_compute::Status statusPermute1 = arm_compute::Status(arm_compute::ErrorCode::OK,

535

"Permute1 status");

536

arm_compute::Status statusSplit = arm_compute::Status(arm_compute::ErrorCode::OK,

537

"Split status");

538

arm_compute::Status statusLSTM = arm_compute::Status(arm_compute::ErrorCode::OK,

539

"LSTM status");

540

arm_compute::Status statusConcat = arm_compute::Status(arm_compute::ErrorCode::OK,

541

"Concat status");

542

arm_compute::Status statusPermute2 = arm_compute::Status(arm_compute::ErrorCode::OK,

543

"Permute2 status");

544

545

const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);

546

const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);

//

// Permute validate

//

Narumol Prangnawarat

71311e4

2023-05-29 15:54:57 +0100

[diff] [blame^]

551

TensorInfo permuteOutInfo = armnnUtils::Permuted(input, { 1U, 0U, 2U });

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

552

arm_compute::TensorInfo aclPermuteOutInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permuteOutInfo);

553

if (!descriptor.m_TimeMajor)

554

{

555

statusPermute1 = arm_compute::CLPermute::validate(&aclInputInfo,

556

&aclPermuteOutInfo,

557

arm_compute::PermutationVector(0U, 2U, 1U));

}

//

// Split and Concat Tensors validate

562

//

563

std::vector<arm_compute::TensorInfo> splitterOutputsTensorInfos;

564

std::vector<arm_compute::TensorInfo> concatInputsTensorInfos;

565

std::vector<arm_compute::ITensorInfo*> splitterOutputsTensorInfosPtr;

566

std::vector<const arm_compute::ITensorInfo*> concatInputsTensorInfosPtr;

567

splitterOutputsTensorInfos.reserve(maxTime);

568

concatInputsTensorInfos.reserve(maxTime);

569

for (unsigned int i = 0; i < maxTime; ++i)

570

{

571

arm_compute::TensorInfo splitter_out;

572

arm_compute::TensorInfo concat_in;

573

574

auto splitterTensorInfo = TensorInfo(input);

575

auto concatTensorInfo = TensorInfo(output);

576

splitterTensorInfo.SetShape({batchSize, inputSize});

577

concatTensorInfo.SetShape({batchSize, outputSize});

578

579

arm_compute::TensorInfo aclSplitterTensorInfo

580

= armcomputetensorutils::BuildArmComputeTensorInfo(splitterTensorInfo);

581

arm_compute::TensorInfo aclConcatTensorInfo

582

= armcomputetensorutils::BuildArmComputeTensorInfo(concatTensorInfo);

583

584

splitterOutputsTensorInfos.emplace_back(aclSplitterTensorInfo);

585

concatInputsTensorInfos.emplace_back(aclConcatTensorInfo);

586

splitterOutputsTensorInfosPtr.emplace_back(&splitterOutputsTensorInfos[i]);

587

concatInputsTensorInfosPtr.emplace_back(&concatInputsTensorInfos[i]);

}

//

// Split validate

//

unsigned int numberDimensions = 3;

594

unsigned int dimension = 0; // splitting on 0-dimension (i.e. maxTime dimension)

595

unsigned int aclAxisSplit = CalcAclAxis(numberDimensions, dimension);

596

597

if (maxTime != 1) // ACL split does not work with only one element to split.

598

{

599

if (!descriptor.m_TimeMajor)

600

{

601

statusSplit = arm_compute::CLSplit::validate(&aclPermuteOutInfo,

602

splitterOutputsTensorInfosPtr,

aclAxisSplit);

}

else

{

statusSplit = arm_compute::CLSplit::validate(&aclInputInfo, splitterOutputsTensorInfosPtr, aclAxisSplit);

}

}

//

// LSTM validate

//

arm_compute::LSTMParams<arm_compute::ITensorInfo> lstm_params_info;

616

Narumol Prangnawarat

71311e4

2023-05-29 15:54:57 +0100

[diff] [blame^]

617

unsigned int numUnits = cellStateIn.GetShape()[1];

618

unsigned int scratchBufferFactor = 4;

619

620

if (descriptor.m_CifgEnabled)

621

{

622

// scratchBuffer = { batchSize, numUnits * 3 } with CIFG

623

scratchBufferFactor = 3;

624

}

625

626

const TensorInfo& scratchBuffer = TensorInfo({ batchSize, numUnits * scratchBufferFactor }, input.GetDataType());

Cathal Corbett

4952a3e

2022-03-03 15:14:18 +0000

[diff] [blame]

627

628

// The inputs and outputs

629

const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn);

630

const arm_compute::TensorInfo aclCellStateInInfo = BuildArmComputeTensorInfo(cellStateIn);

631

const arm_compute::TensorInfo aclScratchBufferInfo = BuildArmComputeTensorInfo(scratchBuffer);

632

const arm_compute::TensorInfo aclOutputStateOutInfo = BuildArmComputeTensorInfo(outputStateOut);

633

const arm_compute::TensorInfo aclCellStateOutInfo = BuildArmComputeTensorInfo(cellStateOut);

634

635

// Basic parameters

636

const arm_compute::TensorInfo aclInputToForgetWeightsInfo

637

= BuildArmComputeTensorInfo(paramsInfo.GetInputToForgetWeights());

638

const arm_compute::TensorInfo aclInputToCellWeightsInfo

639

= BuildArmComputeTensorInfo(paramsInfo.GetInputToCellWeights());

640

const arm_compute::TensorInfo aclInputToOutputWeightsInfo

641

= BuildArmComputeTensorInfo(paramsInfo.GetInputToOutputWeights());

642

const arm_compute::TensorInfo aclRecurrentToForgetWeightsInfo

643

= BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToForgetWeights());

644

const arm_compute::TensorInfo aclRecurrentToCellWeightsInfo

645

= BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToCellWeights());

646

const arm_compute::TensorInfo aclRecurrentToOutputWeightsInfo

647

= BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToOutputWeights());

648

const arm_compute::TensorInfo aclForgetGateBiasInfo

649

= BuildArmComputeTensorInfo(paramsInfo.GetForgetGateBias());

650

const arm_compute::TensorInfo aclCellBiasInfo

651

= BuildArmComputeTensorInfo(paramsInfo.GetCellBias());

652

const arm_compute::TensorInfo aclOutputGateBiasInfo

653

= BuildArmComputeTensorInfo(paramsInfo.GetOutputGateBias());

654

655

arm_compute::TensorInfo aclInputToInputWeightsInfo;

656

arm_compute::TensorInfo aclRecurrentToInputWeightsInfo;

657

arm_compute::TensorInfo aclCellToInputWeightsInfo;

658

arm_compute::TensorInfo aclInputGateBiasInfo;

659

arm_compute::TensorInfo aclProjectionWeightsInfo;

660

arm_compute::TensorInfo aclProjectionBiasInfo;

661

arm_compute::TensorInfo aclCellToForgetWeightsInfo;

662

arm_compute::TensorInfo aclCellToOutputWeightsInfo;

663

664

arm_compute::TensorInfo aclInputLayerNormWeightsInfo;

665

arm_compute::TensorInfo aclForgetLayerNormWeightsInfo;

666

arm_compute::TensorInfo aclCellLayerNormWeightsInfo;

667

arm_compute::TensorInfo aclOutputLayerNormWeightsInfo;

668

669

670

if (!descriptor.m_CifgEnabled)

671

{

672

if (descriptor.m_PeepholeEnabled)

673

{

674

aclCellToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToInputWeights());

675

}

676

aclInputToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputToInputWeights());

677

aclRecurrentToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToInputWeights());

678

aclInputGateBiasInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputGateBias());

679

680

lstm_params_info.set_cifg_params(&aclInputToInputWeightsInfo,

681

&aclRecurrentToInputWeightsInfo,

682

descriptor.m_PeepholeEnabled ? &aclCellToInputWeightsInfo : nullptr,

683

&aclInputGateBiasInfo);

684

}

685

686

if (descriptor.m_ProjectionEnabled)

687

{

688

if (paramsInfo.m_ProjectionBias != nullptr)

689

{

690

aclProjectionBiasInfo = BuildArmComputeTensorInfo(paramsInfo.GetProjectionBias());

691

}

692

aclProjectionWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetProjectionWeights());

693

694

lstm_params_info.set_projection_params(&aclProjectionWeightsInfo,

695

paramsInfo.m_ProjectionBias ? &aclProjectionBiasInfo : nullptr);

696

}

697

698

if (descriptor.m_PeepholeEnabled)

699

{

700

aclCellToForgetWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToForgetWeights());

701

aclCellToOutputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToOutputWeights());

702

703

lstm_params_info.set_peephole_params(&aclCellToForgetWeightsInfo, &aclCellToOutputWeightsInfo);

704

}

705

706

if (descriptor.m_LayerNormEnabled)

707

{

708

if (!descriptor.m_CifgEnabled)

709

{

710

aclInputLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputLayerNormWeights());

711

}

712

aclForgetLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetForgetLayerNormWeights());

713

aclCellLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellLayerNormWeights());

714

aclOutputLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetOutputLayerNormWeights());

715

716

lstm_params_info.set_layer_normalization_params(descriptor.m_CifgEnabled ? nullptr :

717

&aclInputLayerNormWeightsInfo,

718

&aclForgetLayerNormWeightsInfo,

719

&aclCellLayerNormWeightsInfo,

720

&aclOutputLayerNormWeightsInfo);

721

}

722

723

// Need to be set at negative threshold to be compatible for ACL

724

float cell_threshold = descriptor.m_ClippingThresCell;

725

float projection_threshold = descriptor.m_ClippingThresProj;

726

727

arm_compute::ActivationLayerInfo activationLayerInfo =

728

ConvertLstmActivationFuncToAclLayerInfo(descriptor.m_ActivationFunc);

729

730

for (unsigned int i = 0; i != maxTime; ++i)

731

{

732

733

// Set LSTM input and output ITensors depending on:

734

// input format (timeMajor) & number of LSTM batches (maxTime).

735

arm_compute::ITensorInfo* outputLSTM;

736

arm_compute::ITensorInfo* inputLSTM;

737

// If there is only one LSTM time major batch, we will not concat OR permute.

738

// Set input of LSTM to be first input ITensor.

739

// Set output of LSTM to be final output ITensor.

740

// LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo.

741

if (maxTime == 1 && !descriptor.m_TimeMajor)

742

{

743

TensorShape inputShape = GetTensorShape(aclInputInfo.tensor_shape(), 1U);

744

TensorShape outputShape = GetTensorShape(aclOutputInfo.tensor_shape(), 1U);

745

TensorShape inputShapeShrink({inputShape[1], inputShape[2]});

746

TensorShape outputShapeShrink({outputShape[1], outputShape[2]});

747

auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);

748

auto acl_output_shape_shrink = BuildArmComputeTensorShape(outputShapeShrink);

749

const_cast<arm_compute::TensorInfo*>(&aclInputInfo)->set_tensor_shape(acl_input_shape_shrink);

750

inputLSTM = const_cast<arm_compute::TensorInfo*>(&aclInputInfo);

751

const_cast<arm_compute::TensorInfo*>(&aclOutputInfo)->set_tensor_shape(acl_output_shape_shrink);

752

outputLSTM = const_cast<arm_compute::TensorInfo*>(&aclOutputInfo);

753

}

754

// If there is only one LSTM batch major batch, we will not concat, only permute.

755

// Set input of LSTM to be output of initial permute.

756

// Set output of LSTM to be first element of m_ConcatInputs & use that value later in permute.

757

// LSTM output cannot be > 2 dimensions so need to resize its TensorInfo.

758

else if (maxTime == 1 && !descriptor.m_TimeMajor)

759

{

760

TensorShape inputShape = GetTensorShape(aclPermuteOutInfo.tensor_shape(), 1U);

761

TensorShape inputShapeShrink({inputShape[1], inputShape[2]});

762

auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);

763

aclPermuteOutInfo.set_tensor_shape(acl_input_shape_shrink);

764

inputLSTM = &aclPermuteOutInfo;

765

outputLSTM = const_cast<arm_compute::ITensorInfo*>(concatInputsTensorInfosPtr[i]);

766

}

767

// Batch major AND/OR 2+ LSTM batches so will use concat AND/OR permute later on.

768

else

769

{

770

inputLSTM = splitterOutputsTensorInfosPtr[i];

771

outputLSTM = const_cast<arm_compute::ITensorInfo*>(concatInputsTensorInfosPtr[i]);

772

}

773

774

statusLSTM = arm_compute::CLLSTMLayer::validate(inputLSTM,

775

&aclInputToForgetWeightsInfo,

776

&aclInputToCellWeightsInfo,

777

&aclInputToOutputWeightsInfo,

778

&aclRecurrentToForgetWeightsInfo,

779

&aclRecurrentToCellWeightsInfo,

780

&aclRecurrentToOutputWeightsInfo,

781

&aclForgetGateBiasInfo,

782

&aclCellBiasInfo,

783

&aclOutputGateBiasInfo,

784

&aclOutputStateInInfo,

785

&aclCellStateInInfo,

786

&aclScratchBufferInfo,

787

&aclOutputStateOutInfo,

788

&aclCellStateOutInfo,

outputLSTM,

lstm_params_info,

activationLayerInfo,

cell_threshold,

projection_threshold);

794

795

if (statusLSTM.error_code() != arm_compute::ErrorCode::OK)

{

break;

}

}

//

// Concat validate

//

// Expand dimensions of LSTM outputs adding one empty dimension to fit concatenate inputs.

806

TensorShape shape = GetTensorShape(concatInputsTensorInfosPtr[0]->tensor_shape(), 1U);

807

TensorShape shapeExpandTimeMajor({1, shape[0], shape[1]});

808

TensorShape shapeExpandBatchMajor({shape[0], 1, shape[1]});

809

810

TensorInfo concatOuputTensorInfo = TensorInfo(output);

811

concatOuputTensorInfo.SetShape(timeMajorShapeOutput);

812

arm_compute::TensorInfo aclConcatOuputTensorInfo= BuildArmComputeTensorInfo(concatOuputTensorInfo);

813

814

if (maxTime != 1) // ACL concat does not work with only one element to concatenate.

815

{

816

for (unsigned int i = 0; i < maxTime; ++i)

817

{

818

auto acl_shape_expand = BuildArmComputeTensorShape(shapeExpandTimeMajor);

819

concatInputsTensorInfos[i].set_tensor_shape(acl_shape_expand);

820

}

821

822

unsigned int aclAxisConcat = CalcAclAxis(numberDimensions, dimension);

823

if (!descriptor.m_TimeMajor)

824

{

825

statusConcat = arm_compute::CLConcatenateLayer::validate(concatInputsTensorInfosPtr,

826

&aclConcatOuputTensorInfo,

aclAxisConcat);

}

else

{

statusConcat = arm_compute::CLConcatenateLayer::validate(concatInputsTensorInfosPtr,

&aclOutputInfo,

aclAxisConcat);

}

}

// If only one LSTM batch, we do not concat and/or permute.

837

// Must ensure final output info is expanded to correct batch major dimensions.

838

else

839

{

840

if (!descriptor.m_TimeMajor)

841

{

842

const_cast<arm_compute::TensorInfo*>(&aclInputInfo)->set_tensor_shape(

843

BuildArmComputeTensorShape(shapeExpandBatchMajor));

}

else

{

const_cast<arm_compute::TensorInfo*>(&aclInputInfo)->set_tensor_shape(

848

BuildArmComputeTensorShape(shapeExpandTimeMajor));

}

}

//

// Permute validate

//

if (!descriptor.m_TimeMajor)

855

{

856

// Output now time major. Permute output back to batch major.

857

if (maxTime != 1)

858

{

859

statusPermute2 = arm_compute::CLPermute::validate(&aclConcatOuputTensorInfo,

860

&aclOutputInfo,

861

arm_compute::PermutationVector(0U, 2U, 1U));

}

else

{

statusPermute2 = arm_compute::CLPermute::validate(concatInputsTensorInfosPtr[0],

866

&aclOutputInfo,

867

arm_compute::PermutationVector(0U, 2U, 1U));

}

}

auto okCode = arm_compute::ErrorCode::OK;

872

if (statusPermute1.error_code() == okCode &&

873

statusSplit.error_code() == okCode &&

874

statusLSTM .error_code() == okCode &&

875

statusConcat.error_code() == okCode &&

876

statusPermute2.error_code() == okCode)

877

{

878

return arm_compute::Status(arm_compute::ErrorCode::OK,

879

"All Unidirectional Sequence LSTM layer validate status OK.");

}

else

{

return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,

884

"Unidirectional Sequence LSTM layer validate status failed.");

}

}

void ClUnidirectionalSequenceLstmFloatWorkload::FreeUnusedTensors()

889

{

890

FreeTensorIfUnused(m_InputToInputWeightsTensor);

891

FreeTensorIfUnused(m_InputToForgetWeightsTensor);

892

FreeTensorIfUnused(m_InputToCellWeightsTensor);

893

FreeTensorIfUnused(m_InputToOutputWeightsTensor);

894

FreeTensorIfUnused(m_RecurrentToInputWeightsTensor);

895

FreeTensorIfUnused(m_RecurrentToForgetWeightsTensor);

896

FreeTensorIfUnused(m_RecurrentToCellWeightsTensor);

897

FreeTensorIfUnused(m_RecurrentToOutputWeightsTensor);

898

FreeTensorIfUnused(m_CellToInputWeightsTensor);

899

FreeTensorIfUnused(m_CellToForgetWeightsTensor);

900

FreeTensorIfUnused(m_CellToOutputWeightsTensor);

901

FreeTensorIfUnused(m_InputGateBiasTensor);

902

FreeTensorIfUnused(m_ForgetGateBiasTensor);

903

FreeTensorIfUnused(m_CellBiasTensor);

904

FreeTensorIfUnused(m_OutputGateBiasTensor);

905

FreeTensorIfUnused(m_ProjectionWeightsTensor);

906

FreeTensorIfUnused(m_ProjectionBiasTensor);

907

FreeTensorIfUnused(m_InputLayerNormWeightsTensor);

908

FreeTensorIfUnused(m_ForgetLayerNormWeightsTensor);

909

FreeTensorIfUnused(m_CellLayerNormWeightsTensor);

910

FreeTensorIfUnused(m_OutputLayerNormWeightsTensor);

911

FreeTensorIfUnused(m_ScratchBuffer);

912

}

913

914

} //namespace armnn