Blame - src/runtime/gpu/cl/operators/ClGemm.cpp - ml/ComputeLibrary

inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,

105

const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)

106

{

107

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel

108

TensorInfo tmp_b_info{};

109

// Validate reshape RHS kernel

110

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));

111

if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))

{

return false;

}

// Validate mm kernel

116

gemm_kernel_info.lhs_info = lhs_info;

117

gemm_kernel_info.rhs_info = rhs_info;

118

gemm_kernel_info.has_pad_y = false;

119

if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

gemm_kernel_info.has_pad_y = true;

124

if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

return true;

}

//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs

132

inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,

133

const ITensorInfo *b,

134

const ITensorInfo *c, const ITensorInfo *output)

135

{

136

auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);

137

if(config)

138

{

139

if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))

140

{

141

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

142

return { config.lhs_info, config.rhs_info };

143

}

144

}

145

config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);

146

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

147

return { config.lhs_info, config.rhs_info };

148

}

149

150

// Validate lhs_info and rhs_info for reshaped kernel

151

inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,

152

const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)

153

{

154

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel

155

TensorInfo tmp_a_info{};

156

TensorInfo tmp_b_info{};

157

158

// Validate reshape LHS kernel

159

auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));

160

if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))

{

return false;

}

// Validate reshape RHS kernel

166

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));

167

if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))

{

return false;

}

// Validate mm kernel

172

gemm_kernel_info.lhs_info = lhs_info;

173

gemm_kernel_info.rhs_info = rhs_info;

174

if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

return true;

}

//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs

182

inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,

183

const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)

184

{

185

auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);

186

if(config)

187

{

188

if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))

189

{

190

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

191

return { config.lhs_info, config.rhs_info };

192

}

193

}

194

config = auto_heuristics::select_default_gemm_config_reshaped(query);

195

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

196

return { config.lhs_info, config.rhs_info };

}

} // namespace

/** Default constructor.
 *
 * Allocates one instance of every kernel the operator may dispatch (the reshaped-only-RHS matrix multiply
 * kernel is instantiated twice: regular and fallback), starts with the NATIVE_V1 kernel type, marks the
 * operator as not prepared and sizes the auxiliary memory table with AuxTensorIdx::Count entries.
 */
ClGemm::ClGemm()
    : _mm_kernel(std::make_unique<ClGemmMatrixMultiplyKernel>()),
      _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()),
      _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
      _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
      _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
      _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
      _tmp_a(),
      _tmp_b(),
      _reshape_b_only_on_first_run(false),
      _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1),
      _is_prepared(false),
      _aux_mem(AuxTensorIdx::Count)
{
    // Nothing else to do: all state is set up by the member initializers above
}

void ClGemm::configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

217

const GEMMInfo &gemm_info)

218

{

219

const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

220

const unsigned int n = b->dimension(0);

221

const unsigned int k = a->dimension(0);

222

const GPUTarget gpu_target = CLScheduler::get().target();

223

224

// Set the target for the kernels

225

_mm_kernel->set_target(gpu_target);

226

227

GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());

228

229

// Configure and tune matrix multiply kernel

230

_mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());

231

232

// Tune kernel statically

233

CLScheduler::get().tune_kernel_static(*_mm_kernel);

234

}

235

236

void ClGemm::configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

237

const GEMMInfo &gemm_info)

238

{

239

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

240

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

241

const unsigned int n = b->dimension(0);

242

const unsigned int k = a->dimension(0);

243

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

244

const GPUTarget gpu_target = CLScheduler::get().target();

245

int mult_transpose1xW_width = 1;

246

int mult_interleave4x4_height = 1;

247

248

// Set the target for the kernels

249

_reshape_lhs_kernel->set_target(gpu_target);

250

_mm_kernel->set_target(gpu_target);

251

252

if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)

253

{

254

mult_transpose1xW_width = 4;

255

mult_interleave4x4_height = 2;

256

}

257

258

GEMMRHSMatrixInfo rhs_info;

259

rhs_info.n0 = 16 / b->element_size();

260

rhs_info.k0 = 1;

261

rhs_info.h0 = mult_transpose1xW_width;

262

rhs_info.interleave = false;

263

rhs_info.transpose = false;

264

265

GEMMLHSMatrixInfo lhs_info;

266

lhs_info.m0 = 4;

267

lhs_info.k0 = 4;

268

lhs_info.v0 = mult_interleave4x4_height;

269

lhs_info.interleave = true;

270

lhs_info.transpose = true;

271

272

GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());

273

274

// Configure interleave kernel

275

_reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);

276

277

// Configure transpose kernel

278

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

279

280

// Configure and tune matrix multiply kernel

281

_mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());

282

283

CLScheduler::get().tune_kernel_static(*_mm_kernel);

284

285

// Request memory for LHS and RHS reshape matrix

286

_aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());

287

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

288

}

289

290

void ClGemm::configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

291

const GEMMInfo &gemm_info)

292

{

293

DataType data_type = a->data_type();

294

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

295

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

296

const unsigned int n = b->dimension(0);

297

const unsigned int k = a->dimension(0);

298

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

299

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

300

const GPUTarget gpu_target = CLScheduler::get().target();

301

bool broadcast_bias = gemm_info.broadcast_bias();

302

303

GEMMKernelInfo kernel_info;

kernel_info.m = m;

kernel_info.n = n;

kernel_info.k = k;

kernel_info.depth_output_gemm3d = depth_output_gemm3d;

308

kernel_info.reinterpret_input_as_3d = false;

309

kernel_info.broadcast_bias = broadcast_bias;

310

kernel_info.activation_info = gemm_info.activation_info();

311

312

// Set the target for the kernels

313

_reshape_lhs_kernel->set_target(gpu_target);

314

_mm_kernel->set_target(gpu_target);

315

316

GEMMLHSMatrixInfo lhs_info{};

317

GEMMRHSMatrixInfo rhs_info{};

318

319

// Pick up the GEMM configuration

320

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b,

321

c, output, gemm_info.reinterpret_input_as_3d());

322

323

_reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());

324

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

325

326

// Configure and tune matrix multiply kernel

327

_mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

328

329

// Request memory for LHS and RHS reshape matrix

330

_aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());

331

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

332

}

333

334

void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

335

const GEMMInfo &gemm_info)

336

{

337

DataType data_type = a->data_type();

338

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

339

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

340

const unsigned int n = b->dimension(0);

341

const unsigned int k = a->dimension(0);

342

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

343

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

344

const GPUTarget gpu_target = CLScheduler::get().target();

345

bool broadcast_bias = gemm_info.broadcast_bias();

346

347

GEMMKernelInfo kernel_info;

kernel_info.m = m;

kernel_info.n = n;

kernel_info.k = k;

kernel_info.depth_output_gemm3d = depth_output_gemm3d;

352

kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;

353

kernel_info.broadcast_bias = broadcast_bias;

354

kernel_info.activation_info = gemm_info.activation_info();

355

356

// Set the target for the kernels

357

_mm_kernel->set_target(gpu_target);

358

359

GEMMLHSMatrixInfo lhs_info{};

360

GEMMRHSMatrixInfo rhs_info{};

361

362

// Pick up the GEMM configuration

363

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output);

364

365

// Transpose matrix

366

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

367

368

// Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true)

369

// During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have

370

// pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false

371

372

// Configure matrix multiply kernel with no y padding support

373

kernel_info.has_pad_y = false;

374

_mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

375

376

// Configure matrix multiply kernel with y padding support

377

kernel_info.has_pad_y = true;

378

_mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

379

380

// Request memory for RHS reshape matrix

381

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

382

}

383

384

/** Validate the NATIVE_V1 GEMM variant (no reshaping of the input matrices).
 *
 * Every parameter is forwarded to ClGemmMatrixMultiplyKernel::validate(), so none of them is marked as
 * unused (the former ARM_COMPUTE_UNUSED markers on alpha and output were misleading).
 *
 * @param[in] a         Tensor info passed to the matrix multiply kernel as first operand
 * @param[in] b         Tensor info passed to the matrix multiply kernel as second operand
 * @param[in] c         Tensor info passed to the matrix multiply kernel as third operand
 * @param[in] output    Tensor info passed to the matrix multiply kernel as destination
 * @param[in] alpha     Scalar forwarded to the matrix multiply kernel
 * @param[in] beta      Scalar forwarded to the matrix multiply kernel
 * @param[in] gemm_info GEMM meta-data (3D reinterpretation, output depth, bias broadcast, mixed precision, activation)
 *
 * @return An empty Status on success, otherwise the error reported by the kernel validation
 */
Status ClGemm::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // No reshape is involved, hence both reshape multipliers are 1
    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta,
                                                                     false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));

    return Status{};
}

/** Validate the RESHAPED_V1 GEMM variant.
 *
 * Builds the same fixed LHS/RHS block configuration as configure_reshaped_v1(), then validates the LHS
 * reshape kernel, the RHS reshape kernel and the matrix multiply kernel in that order. All parameters are
 * consumed by these validations, so none of them is marked as unused (the former ARM_COMPUTE_UNUSED
 * markers on alpha and output were misleading).
 *
 * @param[in] a         Tensor info of the matrix handed to the LHS reshape kernel
 * @param[in] b         Tensor info of the matrix handed to the RHS reshape kernel
 * @param[in] c         Tensor info passed to the matrix multiply kernel as third operand
 * @param[in] output    Tensor info passed to the matrix multiply kernel as destination
 * @param[in] alpha     Scalar forwarded to the matrix multiply kernel
 * @param[in] beta      Scalar forwarded to the matrix multiply kernel
 * @param[in] gemm_info GEMM meta-data (3D reinterpretation, output depth, bias broadcast, mixed precision, activation)
 *
 * @return An empty Status on success, otherwise the first error reported by a kernel validation
 */
Status ClGemm::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target                = CLScheduler::get().target();
    const bool         reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                         = b->dimension(0);
    const unsigned int k                         = a->dimension(0);
    int                mult_transpose1xW_width   = 1;
    int                mult_interleave4x4_height = 1;
    const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();

    // Bifrost uses larger reshape multipliers
    if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
    {
        mult_transpose1xW_width   = 4;
        mult_interleave4x4_height = 2;
    }

    // RHS block layout: n0 is the number of elements that fit in 16 bytes
    GEMMRHSMatrixInfo rhs_info;
    rhs_info.n0         = 16 / b->element_size();
    rhs_info.k0         = 1;
    rhs_info.h0         = mult_transpose1xW_width;
    rhs_info.interleave = false;
    rhs_info.transpose  = false;

    // LHS block layout: interleaved and transposed 4x4 blocks
    GEMMLHSMatrixInfo lhs_info;
    lhs_info.m0         = 4;
    lhs_info.k0         = 4;
    lhs_info.v0         = mult_interleave4x4_height;
    lhs_info.interleave = true;
    lhs_info.transpose  = true;

    // The 3D-reinterpretation flag is passed as false here; the flag from gemm_info goes to the LHS reshape validation below
    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());

    // Validate interleave kernel
    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d));

    // Validate transpose kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
                                                                     true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));

    return Status{};
}

/** Validate the RESHAPED GEMM variant.
 *
 * Uses the configuration from the default heuristics, then validates the LHS reshape kernel, the RHS
 * reshape kernel and the reshaped matrix multiply kernel in that order. All parameters are consumed by
 * these validations, so none of them is marked as unused (the former ARM_COMPUTE_UNUSED markers on alpha
 * and output were misleading).
 *
 * @param[in] a         Tensor info of the matrix handed to the LHS reshape kernel
 * @param[in] b         Tensor info of the matrix handed to the RHS reshape kernel
 * @param[in] c         Tensor info passed to the matrix multiply kernel as third operand
 * @param[in] output    Tensor info passed to the matrix multiply kernel as destination
 * @param[in] alpha     Scalar forwarded to the matrix multiply kernel
 * @param[in] beta      Scalar forwarded to the matrix multiply kernel
 * @param[in] gemm_info GEMM meta-data (3D reinterpretation, output depth, bias broadcast, activation)
 *
 * @return An empty Status on success, otherwise the first error reported by a kernel validation
 */
Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const DataType     data_type               = a->data_type();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
    const bool         broadcast_bias          = gemm_info.broadcast_bias();

    GEMMKernelInfo kernel_info;
    kernel_info.m                   = m;
    kernel_info.n                   = n;
    kernel_info.k                   = k;
    kernel_info.depth_output_gemm3d = depth_output_gemm3d;
    // Always false for this variant: the matrix multiply validation receives the reshaped LHS info, and the
    // 3D-reinterpretation flag is given to the LHS reshape validation instead
    kernel_info.reinterpret_input_as_3d = false;
    kernel_info.broadcast_bias          = broadcast_bias;
    kernel_info.activation_info         = gemm_info.activation_info();

    // Pick up the GEMM configuration
    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
    const auto              gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
    const GEMMLHSMatrixInfo lhs_info    = gemm_config.lhs_info;
    const GEMMRHSMatrixInfo rhs_info    = gemm_config.rhs_info;

    // Validate LHS reshape kernel
    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d));

    // Validate RHS reshape kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    return Status{};
}

/** Validate the RESHAPED_ONLY_RHS GEMM variant.
 *
 * Uses the configuration from the default heuristics, then validates the RHS reshape kernel followed by
 * the matrix multiply kernel with has_pad_y = false and with has_pad_y = true. All parameters are
 * consumed by these validations, so none of them is marked as unused (the former ARM_COMPUTE_UNUSED
 * markers on alpha and output were misleading).
 *
 * @param[in] a         Tensor info passed to the matrix multiply kernel as first operand
 * @param[in] b         Tensor info of the matrix handed to the RHS reshape kernel
 * @param[in] c         Tensor info passed to the matrix multiply kernel as third operand
 * @param[in] output    Tensor info passed to the matrix multiply kernel as destination
 * @param[in] alpha     Scalar forwarded to the matrix multiply kernel
 * @param[in] beta      Scalar forwarded to the matrix multiply kernel
 * @param[in] gemm_info GEMM meta-data (3D reinterpretation, output depth, bias broadcast, activation)
 *
 * @return An empty Status on success, otherwise the first error reported by a kernel validation
 */
Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const DataType     data_type               = a->data_type();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
    const bool         broadcast_bias          = gemm_info.broadcast_bias();

    GEMMKernelInfo kernel_info;
    kernel_info.m                       = m;
    kernel_info.n                       = n;
    kernel_info.k                       = k;
    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    kernel_info.broadcast_bias          = broadcast_bias;
    kernel_info.activation_info         = gemm_info.activation_info();

    // Pick up the GEMM configuration
    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
    const auto              gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
    const GEMMLHSMatrixInfo lhs_info    = gemm_config.lhs_info;
    const GEMMRHSMatrixInfo rhs_info    = gemm_config.rhs_info;

    // Validate RHS reshape kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply: both the no-padding kernel and the y-padding fallback must be valid,
    // because configure_reshaped_only_rhs() configures both
    kernel_info.has_pad_y = false;
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    kernel_info.has_pad_y = true;
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    return Status{};
}

void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)

559

{

560

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

561

562

// Perform validation step

563

ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));

564

565

// Check if we need to reshape the matrix B only on the first run

566

_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

567

568

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

569

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

570

const unsigned int n = b->dimension(0);

571

const unsigned int k = a->dimension(0);

572

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

573

574

// Select GEMMType

Giorgio Arena

4403ed3

2021-05-17 13:03:50 +0100

[diff] [blame]

575

_gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run,

576

gemm_info.constant_weights());

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

577

578

const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);

579

580

ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;

581

582

switch(_gemm_kernel_type)

583

{

584

case CLGEMMKernelType::NATIVE_V1:

585

{

586

configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

587

break;

588

}

589

case CLGEMMKernelType::RESHAPED_V1:

590

{

591

configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

592

break;

593

}

594

case CLGEMMKernelType::RESHAPED:

595

{

596

configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

597

break;

598

}

599

case CLGEMMKernelType::RESHAPED_ONLY_RHS:

600

{

601

configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

break;

}

default:

{

ARM_COMPUTE_ERROR("GEMMType not supported");

}

}

}

/** Validate a GEMM configuration without configuring the operator.
 *
 * Selects the GEMM kernel type through auto_select_gemm_kernel() and delegates to the matching
 * validate_*() method. The c tensor info is handed to the selected variant only when it is non-null and
 * beta is not zero.
 *
 * @param[in] a         First input tensor info (a nullptr is reported as an error)
 * @param[in] b         Second input tensor info (a nullptr is reported as an error)
 * @param[in] c         Third input tensor info; may be nullptr
 * @param[in] output    Output tensor info, forwarded to the selected variant
 * @param[in] alpha     Scalar forwarded to the selected variant
 * @param[in] beta      Scalar forwarded to the selected variant; when zero, c is not used
 * @param[in] gemm_info GEMM meta-data
 *
 * @return An empty Status on success, otherwise a Status describing the first failure
 */
Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // a and b are dereferenced right below to derive the GEMM dimensions, so a null descriptor must be
    // reported as an error status instead of being dereferenced
    if(a == nullptr || b == nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_MSG("GEMM input tensor infos a and b must not be nullptr");
    }

    // Derive the GEMM dimensions used to query the kernel-type heuristics
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

    // Select GEMMType
    CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
    {
        CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
    },
    gemm_info.reshape_b_only_on_first_run(), gemm_info.constant_weights());

    // c is fused into the GEMM only when it exists and beta is non-zero
    const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);

    const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;

    switch(gemm_kernel_type)
    {
        case CLGEMMKernelType::NATIVE_V1:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED_V1:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        default:
        {
            ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
        }
    }

    return Status{};
}

/** Run the GEMM operator.
 *
 * Fetches the operands from the tensor pack, wraps the auxiliary reshape buffers, calls prepare() and then
 * enqueues the kernels of the GEMM variant chosen at configure time.
 *
 * @param[in,out] tensors Tensor pack; ACL_SRC_0 and ACL_DST must be present, ACL_SRC_1 and ACL_SRC_2 are read as well
 */
void ClGemm::run(ITensorPack &tensors)
{
    const ITensor *src0 = tensors.get_const_tensor(ACL_SRC_0);
    const ITensor *src1 = tensors.get_const_tensor(ACL_SRC_1);
    const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2);
    ITensor       *dst  = tensors.get_tensor(ACL_DST);

    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, dst);

    // Auxiliary tensors holding the reshaped LHS / RHS matrices
    CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true);
    CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true);

    // Prepare the consts if needed
    prepare(tensors);

    // Enqueues the RHS reshape (transpose) kernel unless B is reshaped only on the first run
    const auto enqueue_rhs_reshape_if_needed = [&]()
    {
        if(_reshape_b_only_on_first_run)
        {
            return;
        }
        ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
        CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
    };

    // Run matrix multiply kernel
    switch(_gemm_kernel_type)
    {
        case CLGEMMKernelType::NATIVE_V1:
        {
            CLScheduler::get().enqueue_op(*_mm_kernel, tensors, true);
            break;
        }
        case CLGEMMKernelType::RESHAPED_V1:
        case CLGEMMKernelType::RESHAPED:
        {
            // Run the LHS reshape (interleave) kernel, then the RHS reshape if required
            ITensorPack reshape_lhs_pack{ { ACL_SRC, src0 }, { ACL_DST, lhs_reshaped.get() } };
            CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);

            enqueue_rhs_reshape_if_needed();

            ITensorPack gemm_reshaped_pack{ { ACL_SRC_0, lhs_reshaped.get() }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };

            // RESHAPED_V1 uses the generic matrix multiply kernel, RESHAPED its dedicated one
            if(_gemm_kernel_type != CLGEMMKernelType::RESHAPED)
            {
                CLScheduler::get().enqueue_op(*_mm_kernel, gemm_reshaped_pack, true);
            }
            else
            {
                CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
            }
            break;
        }
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        {
            enqueue_rhs_reshape_if_needed();

            // Any top/bottom padding on the lhs or dst tensor selects the fallback kernel
            const unsigned int cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom;
            const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
            const bool         has_pad_y           = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);

            ITensorPack gemm_reshaped_onlyrhs_pack{ { ACL_SRC_0, src0 }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };

            auto &mm_kernel = has_pad_y ? *_mm_reshaped_only_rhs_fallback_kernel : *_mm_reshaped_only_rhs_kernel;
            CLScheduler::get().enqueue_op(mm_kernel, gemm_reshaped_onlyrhs_pack, true);
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("GEMMType not supported");
        }
    }
}

/** One-off preparation step of the operator.
 *
 * On the first call only: if the auxiliary RHS memory was requested as persistent and both ACL_SRC_1 and the
 * auxiliary RHS tensor are present in the pack, the RHS reshape kernel is enqueued to transform the RHS
 * matrix. Otherwise the RHS is assumed to be already transformed. Subsequent calls do nothing.
 *
 * @param[in,out] constants Tensor pack queried for ACL_SRC_1 and the auxiliary RHS reshape tensor
 */
void ClGemm::prepare(ITensorPack &constants)
{
    if(_is_prepared)
    {
        return;
    }

    const ITensor *src1    = constants.get_const_tensor(ACL_SRC_1);
    ICLTensor     *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));

    // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed
    const bool rhs_is_persistent = (_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent);
    const bool tensors_available = (src1 != nullptr) && (rhs_aux != nullptr);
    if(rhs_is_persistent && tensors_available)
    {
        ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");

        CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
        ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);

        ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
        CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
    }

    _is_prepared = true;
}

/** Report the auxiliary memory requested by the operator.
 *
 * @return The auxiliary memory table filled in by the configure_*() methods
 */
experimental::MemoryRequirements ClGemm::workspace() const
{
    return _aux_mem;
}

} // namespace opencl

770

} // namespace arm_compute