Blame - src/runtime/gpu/cl/operators/ClGemm.cpp - ml/ComputeLibrary

inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,

105

const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)

106

{

107

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel

108

TensorInfo tmp_b_info{};

109

// Validate reshape RHS kernel

110

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));

111

if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))

{

return false;

}

// Validate mm kernel

116

gemm_kernel_info.lhs_info = lhs_info;

117

gemm_kernel_info.rhs_info = rhs_info;

118

gemm_kernel_info.has_pad_y = false;

119

if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

gemm_kernel_info.has_pad_y = true;

124

if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

return true;

}

//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs

132

inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,

133

const ITensorInfo *b,

134

const ITensorInfo *c, const ITensorInfo *output)

135

{

136

auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);

137

if(config)

138

{

139

if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))

140

{

141

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

142

return { config.lhs_info, config.rhs_info };

143

}

144

}

145

config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);

146

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

147

return { config.lhs_info, config.rhs_info };

148

}

149

150

// Validate lhs_info and rhs_info for reshaped kernel

151

inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,

152

const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)

153

{

154

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel

155

TensorInfo tmp_a_info{};

156

TensorInfo tmp_b_info{};

157

158

// Validate reshape LHS kernel

159

auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));

160

if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))

{

return false;

}

// Validate reshape RHS kernel

166

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));

167

if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))

{

return false;

}

// Validate mm kernel

172

gemm_kernel_info.lhs_info = lhs_info;

173

gemm_kernel_info.rhs_info = rhs_info;

174

if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

return true;

}

//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs

182

inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,

183

const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)

184

{

185

auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);

186

if(config)

187

{

188

if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))

189

{

190

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

191

return { config.lhs_info, config.rhs_info };

192

}

193

}

194

config = auto_heuristics::select_default_gemm_config_reshaped(query);

195

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

196

return { config.lhs_info, config.rhs_info };

}

} // namespace

// Default constructor.
// Allocates one instance of every kernel this operator may dispatch (the reshaped-only-RHS
// matrix multiply kernel is instantiated twice: a regular and a fallback variant), leaves the
// temporary tensor infos default-constructed, and starts with the NATIVE_V1 kernel type and
// "reshape B only on first run" disabled.
// _aux_mem is constructed from AuxTensorIdx::Count (presumably one entry per auxiliary tensor; confirm against the header).
ClGemm::ClGemm()
    : _mm_kernel(std::make_unique<ClGemmMatrixMultiplyKernel>()),
      _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()),
      _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
      _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
      _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
      _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
      _tmp_a(),
      _tmp_b(),
      _reshape_b_only_on_first_run(false),
      _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1),
      _aux_mem(AuxTensorIdx::Count)
{
}

void ClGemm::configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

216

const GEMMInfo &gemm_info)

217

{

218

const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

219

const unsigned int n = b->dimension(0);

220

const unsigned int k = a->dimension(0);

221

const GPUTarget gpu_target = CLScheduler::get().target();

222

223

// Set the target for the kernels

224

_mm_kernel->set_target(gpu_target);

225

226

GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());

227

228

// Configure and tune matrix multiply kernel

229

_mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());

230

231

// Tune kernel statically

232

CLScheduler::get().tune_kernel_static(*_mm_kernel);

233

}

234

235

void ClGemm::configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

236

const GEMMInfo &gemm_info)

237

{

238

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

239

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

240

const unsigned int n = b->dimension(0);

241

const unsigned int k = a->dimension(0);

242

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

243

const GPUTarget gpu_target = CLScheduler::get().target();

244

int mult_transpose1xW_width = 1;

245

int mult_interleave4x4_height = 1;

246

247

// Set the target for the kernels

248

_reshape_lhs_kernel->set_target(gpu_target);

249

_mm_kernel->set_target(gpu_target);

250

251

if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)

252

{

253

mult_transpose1xW_width = 4;

254

mult_interleave4x4_height = 2;

255

}

256

257

GEMMRHSMatrixInfo rhs_info;

258

rhs_info.n0 = 16 / b->element_size();

259

rhs_info.k0 = 1;

260

rhs_info.h0 = mult_transpose1xW_width;

261

rhs_info.interleave = false;

262

rhs_info.transpose = false;

263

264

GEMMLHSMatrixInfo lhs_info;

265

lhs_info.m0 = 4;

266

lhs_info.k0 = 4;

267

lhs_info.v0 = mult_interleave4x4_height;

268

lhs_info.interleave = true;

269

lhs_info.transpose = true;

270

271

GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());

272

273

// Configure interleave kernel

274

_reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);

275

276

// Configure transpose kernel

277

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

278

279

// Configure and tune matrix multiply kernel

280

_mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());

281

282

CLScheduler::get().tune_kernel_static(*_mm_kernel);

283

284

// Request memory for LHS and RHS reshape matrix

285

_aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());

286

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

287

}

288

289

void ClGemm::configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

290

const GEMMInfo &gemm_info)

291

{

292

DataType data_type = a->data_type();

293

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

294

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

295

const unsigned int n = b->dimension(0);

296

const unsigned int k = a->dimension(0);

297

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

298

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

299

const GPUTarget gpu_target = CLScheduler::get().target();

300

bool broadcast_bias = gemm_info.broadcast_bias();

301

302

GEMMKernelInfo kernel_info;

kernel_info.m = m;

kernel_info.n = n;

kernel_info.k = k;

kernel_info.depth_output_gemm3d = depth_output_gemm3d;

307

kernel_info.reinterpret_input_as_3d = false;

308

kernel_info.broadcast_bias = broadcast_bias;

309

kernel_info.activation_info = gemm_info.activation_info();

310

311

// Set the target for the kernels

312

_reshape_lhs_kernel->set_target(gpu_target);

313

_mm_kernel->set_target(gpu_target);

314

315

GEMMLHSMatrixInfo lhs_info{};

316

GEMMRHSMatrixInfo rhs_info{};

317

318

// Pick up the GEMM configuration

319

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b,

320

c, output, gemm_info.reinterpret_input_as_3d());

321

322

_reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());

323

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

324

325

// Configure and tune matrix multiply kernel

326

_mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

327

328

// Request memory for LHS and RHS reshape matrix

329

_aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());

330

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

331

}

332

333

void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

334

const GEMMInfo &gemm_info)

335

{

336

DataType data_type = a->data_type();

337

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

338

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

339

const unsigned int n = b->dimension(0);

340

const unsigned int k = a->dimension(0);

341

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

342

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

343

const GPUTarget gpu_target = CLScheduler::get().target();

344

bool broadcast_bias = gemm_info.broadcast_bias();

345

346

GEMMKernelInfo kernel_info;

kernel_info.m = m;

kernel_info.n = n;

kernel_info.k = k;

kernel_info.depth_output_gemm3d = depth_output_gemm3d;

351

kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;

352

kernel_info.broadcast_bias = broadcast_bias;

353

kernel_info.activation_info = gemm_info.activation_info();

354

355

// Set the target for the kernels

356

_mm_kernel->set_target(gpu_target);

357

358

GEMMLHSMatrixInfo lhs_info{};

359

GEMMRHSMatrixInfo rhs_info{};

360

361

// Pick up the GEMM configuration

362

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output);

363

364

// Transpose matrix

365

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

366

367

// Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true)

368

// During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have

369

// pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false

370

371

// Configure matrix multiply kernel with no y padding support

372

kernel_info.has_pad_y = false;

373

_mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

374

375

// Configure matrix multiply kernel with y padding support

376

kernel_info.has_pad_y = true;

377

_mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

378

379

// Request memory for RHS reshape matrix

380

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

381

}

382

383

// Validate the NATIVE_V1 path: only the matrix multiply kernel is checked, directly on a and b.
// Returns the first error reported by the kernel validation, or an empty Status on success.
// a and b are dereferenced here and must not be nullptr.
Status ClGemm::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // alpha and output are forwarded to the kernel validation below, so they are not marked as unused

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // Both reshape multipliers are 1 on this path
    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta,
                                                                     false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));

    return Status{};
}

// Validate the RESHAPED_V1 path: the LHS reshape kernel, the RHS reshape kernel and the matrix
// multiply kernel (on the reshaped tensor infos) are checked in that order, using the same
// hard-coded LHS/RHS matrix infos as configure_reshaped_v1().
// Returns the first error encountered, or an empty Status on success.
// a and b are dereferenced here and must not be nullptr.
Status ClGemm::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // alpha and output are forwarded to the kernel validation below, so they are not marked as unused

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // Reshape multipliers: 4 and 2 on Bifrost targets, 1 and 1 everywhere else
    int mult_transpose1xW_width   = 1;
    int mult_interleave4x4_height = 1;
    if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
    {
        mult_transpose1xW_width   = 4;
        mult_interleave4x4_height = 2;
    }

    GEMMRHSMatrixInfo rhs_info;
    rhs_info.n0         = 16 / b->element_size();
    rhs_info.k0         = 1;
    rhs_info.h0         = mult_transpose1xW_width;
    rhs_info.interleave = false;
    rhs_info.transpose  = false;

    GEMMLHSMatrixInfo lhs_info;
    lhs_info.m0         = 4;
    lhs_info.k0         = 4;
    lhs_info.v0         = mult_interleave4x4_height;
    lhs_info.interleave = true;
    lhs_info.transpose  = true;

    // The reinterpret flag is false here; the 3D reinterpretation is passed to the LHS reshape validation below instead
    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());

    // Validate interleave kernel
    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d));

    // Validate transpose kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
                                                                     true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));

    return Status{};
}

// Validate the RESHAPED path using the configuration from the default heuristics: the LHS reshape
// kernel, the RHS reshape kernel and the reshaped matrix multiply kernel are checked in that order.
// Returns the first error encountered, or an empty Status on success.
// a and b are dereferenced here and must not be nullptr.
Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // alpha and output are forwarded to the kernel validation below, so they are not marked as unused

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const DataType     data_type               = a->data_type();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
    const bool         broadcast_bias          = gemm_info.broadcast_bias();

    GEMMKernelInfo kernel_info;
    kernel_info.m                   = m;
    kernel_info.n                   = n;
    kernel_info.k                   = k;
    kernel_info.depth_output_gemm3d = depth_output_gemm3d;
    // Same as configure_reshaped_v2(): the flag is passed to the LHS reshape validation instead
    kernel_info.reinterpret_input_as_3d = false;
    kernel_info.broadcast_bias          = broadcast_bias;
    kernel_info.activation_info         = gemm_info.activation_info();

    GEMMLHSMatrixInfo lhs_info;
    GEMMRHSMatrixInfo rhs_info;

    // Pick up the GEMM configuration
    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
    const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
    lhs_info               = gemm_config.lhs_info;
    rhs_info               = gemm_config.rhs_info;

    // Validate LHS reshape kernel
    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d));

    // Validate RHS reshape kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    return Status{};
}

// Validate the RESHAPED_ONLY_RHS path using the configuration from the default heuristics: the
// RHS reshape kernel is checked first, then the matrix multiply kernel is checked twice
// (has_pad_y = false, then has_pad_y = true), matching the two kernels configured by
// configure_reshaped_only_rhs().
// Returns the first error encountered, or an empty Status on success.
// a and b are dereferenced here and must not be nullptr.
Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // alpha and output are forwarded to the kernel validation below, so they are not marked as unused

    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const DataType     data_type               = a->data_type();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
    const bool         broadcast_bias          = gemm_info.broadcast_bias();

    GEMMKernelInfo kernel_info;
    kernel_info.m                       = m;
    kernel_info.n                       = n;
    kernel_info.k                       = k;
    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    kernel_info.broadcast_bias          = broadcast_bias;
    kernel_info.activation_info         = gemm_info.activation_info();

    GEMMLHSMatrixInfo lhs_info;
    GEMMRHSMatrixInfo rhs_info;

    // Pick up the GEMM configuration
    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
    const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
    lhs_info               = gemm_config.lhs_info;
    rhs_info               = gemm_config.rhs_info;

    // Validate RHS reshape kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply without y padding support
    kernel_info.has_pad_y = false;
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    // Validate matrix multiply with y padding support
    kernel_info.has_pad_y = true;
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    return Status{};
}

void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)

558

{

559

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

560

561

// Perform validation step

562

ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));

563

564

// Check if we need to reshape the matrix B only on the first run

565

_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

566

567

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

568

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

569

const unsigned int n = b->dimension(0);

570

const unsigned int k = a->dimension(0);

571

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

572

573

// Select GEMMType

Giorgio Arena

4403ed3

2021-05-17 13:03:50 +0100

[diff] [blame]

574

_gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run,

575

gemm_info.constant_weights());

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

576

577

const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);

578

579

ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;

580

581

switch(_gemm_kernel_type)

582

{

583

case CLGEMMKernelType::NATIVE_V1:

584

{

585

configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

586

break;

587

}

588

case CLGEMMKernelType::RESHAPED_V1:

589

{

590

configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

591

break;

592

}

593

case CLGEMMKernelType::RESHAPED:

594

{

595

configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

596

break;

597

}

598

case CLGEMMKernelType::RESHAPED_ONLY_RHS:

599

{

600

configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

break;

}

default:

{

ARM_COMPUTE_ERROR("GEMMType not supported");

}

}

}

// Static validation entry point.
// Selects the GEMM kernel type with auto_select_gemm_kernel(), exactly as configure() does, and
// delegates to the matching validate_* helper. c is only forwarded when beta is non-zero and c is
// not nullptr.
// Returns an error Status if a or b is nullptr, if the selected kernel type is not supported, or
// if the delegated validation fails; an empty Status otherwise.
Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // a and b are dereferenced below to read the GEMM extents: report an error instead of crashing on nullptr
    if(a == nullptr || b == nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_MSG("Input tensor infos a and b must not be nullptr");
    }

    // GEMM extents as read from the tensor infos
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

    // Select GEMMType
    CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
    {
        CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
    },
    gemm_info.reshape_b_only_on_first_run(), gemm_info.constant_weights());

    // The addition of c is fused only when it would contribute (non-zero beta) and c was provided
    const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);

    const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;

    switch(gemm_kernel_type)
    {
        case CLGEMMKernelType::NATIVE_V1:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED_V1:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        default:
        {
            ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
        }
    }

    return Status{};
}

// Execute the GEMM configured by configure().
// Reads ACL_SRC_0 (lhs), ACL_SRC_1 (rhs), ACL_SRC_2 (third operand) and ACL_DST from the pack,
// calls prepare(), then enqueues the kernels of the selected path. lhs and dst must be present;
// rhs is only handed to the RHS reshape kernel when B is not reshaped on the first run only.
void ClGemm::run(ITensorPack &tensors)
{
    const ITensor *lhs  = tensors.get_const_tensor(ACL_SRC_0);
    const ITensor *rhs  = tensors.get_const_tensor(ACL_SRC_1);
    const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2);
    ITensor       *dst  = tensors.get_tensor(ACL_DST);

    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst);

    // Handlers for the auxiliary reshaped tensors (created for every path, as in the original flow)
    CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true);
    CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true);

    // Prepare the consts if needed
    prepare(tensors);

    // Run matrix multiply kernel
    switch(_gemm_kernel_type)
    {
        case CLGEMMKernelType::NATIVE_V1:
        {
            CLScheduler::get().enqueue_op(*_mm_kernel, tensors, true);
            break;
        }
        case CLGEMMKernelType::RESHAPED_V1:
        case CLGEMMKernelType::RESHAPED:
        {
            // Run interleave kernel
            ITensorPack lhs_reshape_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } };
            CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, lhs_reshape_pack, false);

            // Run transpose kernel, unless B is only reshaped on the first run
            if(!_reshape_b_only_on_first_run)
            {
                ITensorPack rhs_reshape_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
                CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, rhs_reshape_pack, false);
            }

            ITensorPack mm_pack{ { ACL_SRC_0, lhs_reshaped.get() }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };

            // Only RESHAPED_V1 and RESHAPED reach this point: V1 uses the generic kernel, RESHAPED the reshaped one
            if(_gemm_kernel_type != CLGEMMKernelType::RESHAPED)
            {
                CLScheduler::get().enqueue_op(*_mm_kernel, mm_pack, true);
            }
            else
            {
                CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, mm_pack, true);
            }
            break;
        }
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        {
            // Run transpose kernel, unless B is only reshaped on the first run
            if(!_reshape_b_only_on_first_run)
            {
                ITensorPack rhs_reshape_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
                CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, rhs_reshape_pack, false);
            }

            // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement:
            // any top/bottom padding on the lhs or dst tensor selects the fallback kernel
            const auto         lhs_padding     = lhs->info()->padding();
            const auto         dst_padding     = dst->info()->padding();
            const unsigned int lhs_pad_y_total = lhs_padding.top + lhs_padding.bottom;
            const unsigned int dst_pad_y_total = dst_padding.top + dst_padding.bottom;
            const bool         no_pad_y        = (lhs_pad_y_total == 0) && (dst_pad_y_total == 0);

            ITensorPack mm_pack{ { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
            if(no_pad_y)
            {
                CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, mm_pack, true);
            }
            else
            {
                CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, mm_pack, true);
            }
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("GEMMType not supported");
        }
    }
}

// Reshape the RHS matrix ahead of execution when its auxiliary buffer is persistent.
// The reshape kernel is enqueued only if all of the following hold:
//  - the RhsReshape entry of _aux_mem has a Persistent lifetime,
//  - the pack provides ACL_SRC_1,
//  - the pack provides the auxiliary RHS tensor.
// Otherwise nothing is done and the RHS is assumed to be already transformed.
// NOTE(review): there is no "already prepared" guard in this function and run() calls it on every
// invocation, so a pack that always carries ACL_SRC_1 re-runs the reshape each time; confirm
// callers rely on leaving ACL_SRC_1 out of the run pack.
void ClGemm::prepare(ITensorPack &constants)
{
    const ITensor *src1    = constants.get_const_tensor(ACL_SRC_1);
    ICLTensor     *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));

    // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed
    const bool rhs_is_persistent = (_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent);
    if(rhs_is_persistent && (src1 != nullptr) && (rhs_aux != nullptr))
    {
        ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");

        CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
        ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);

        ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
        CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
    }
}

// Return the auxiliary memory requirements recorded in _aux_mem.
// The entries are filled in by the configure_* helpers (LhsReshape / RhsReshape slots).
experimental::MemoryRequirements ClGemm::workspace() const

{

return _aux_mem;

}

} // namespace opencl

764

} // namespace arm_compute