Blame - src/runtime/gpu/cl/operators/ClGemm.cpp - ml/ComputeLibrary

inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,

103

const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)

104

{

105

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel

106

TensorInfo tmp_b_info{};

107

// Validate reshape RHS kernel

108

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));

109

if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))

{

return false;

}

// Validate mm kernel

114

gemm_kernel_info.lhs_info = lhs_info;

115

gemm_kernel_info.rhs_info = rhs_info;

116

gemm_kernel_info.has_pad_y = false;

117

if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

gemm_kernel_info.has_pad_y = true;

122

if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

return true;

}

//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs

130

inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,

131

const ITensorInfo *b,

132

const ITensorInfo *c, const ITensorInfo *output)

133

{

134

auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);

135

if(config)

136

{

137

if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))

138

{

139

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

140

return { config.lhs_info, config.rhs_info };

141

}

142

}

143

config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);

144

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

145

return { config.lhs_info, config.rhs_info };

146

}

147

148

// Validate lhs_info and rhs_info for reshaped kernel

149

inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,

150

const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)

151

{

152

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel

153

TensorInfo tmp_a_info{};

154

TensorInfo tmp_b_info{};

155

156

// Validate reshape LHS kernel

157

auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));

158

if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))

{

return false;

}

// Validate reshape RHS kernel

164

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));

165

if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))

{

return false;

}

// Validate mm kernel

170

gemm_kernel_info.lhs_info = lhs_info;

171

gemm_kernel_info.rhs_info = rhs_info;

172

if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))

{

return false;

}

return true;

}

//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs

180

inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,

181

const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)

182

{

183

auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);

184

if(config)

185

{

186

if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))

187

{

188

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

189

return { config.lhs_info, config.rhs_info };

190

}

191

}

192

config = auto_heuristics::select_default_gemm_config_reshaped(query);

193

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

194

return { config.lhs_info, config.rhs_info };

}

} // namespace

ClGemm::ClGemm()

: _mm_kernel(std::make_unique<ClGemmMatrixMultiplyKernel>()),

200

_reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()),

201

_reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),

202

_mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),

203

_mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),

204

_mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),

205

_tmp_a(),

206

_tmp_b(),

207

_reshape_b_only_on_first_run(false),

208

_gemm_kernel_type(CLGEMMKernelType::NATIVE_V1),

209

_aux_mem(AuxTensorIdx::Count)

{

}

void ClGemm::configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

214

const GEMMInfo &gemm_info)

215

{

216

const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

217

const unsigned int n = b->dimension(0);

218

const unsigned int k = a->dimension(0);

219

const GPUTarget gpu_target = CLScheduler::get().target();

220

221

// Set the target for the kernels

222

_mm_kernel->set_target(gpu_target);

223

224

GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());

225

226

// Configure and tune matrix multiply kernel

227

_mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());

228

229

// Tune kernel statically

230

CLScheduler::get().tune_kernel_static(*_mm_kernel);

231

}

232

233

void ClGemm::configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

234

const GEMMInfo &gemm_info)

235

{

236

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

237

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

238

const unsigned int n = b->dimension(0);

239

const unsigned int k = a->dimension(0);

240

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

241

const GPUTarget gpu_target = CLScheduler::get().target();

242

int mult_transpose1xW_width = 1;

243

int mult_interleave4x4_height = 1;

244

245

// Set the target for the kernels

246

_reshape_lhs_kernel->set_target(gpu_target);

247

_mm_kernel->set_target(gpu_target);

248

249

if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)

250

{

251

mult_transpose1xW_width = 4;

252

mult_interleave4x4_height = 2;

253

}

254

255

GEMMRHSMatrixInfo rhs_info;

256

rhs_info.n0 = 16 / b->element_size();

257

rhs_info.k0 = 1;

258

rhs_info.h0 = mult_transpose1xW_width;

259

rhs_info.interleave = false;

260

rhs_info.transpose = false;

261

262

GEMMLHSMatrixInfo lhs_info;

263

lhs_info.m0 = 4;

264

lhs_info.k0 = 4;

265

lhs_info.v0 = mult_interleave4x4_height;

266

lhs_info.interleave = true;

267

lhs_info.transpose = true;

268

269

GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());

270

271

// Configure interleave kernel

272

_reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);

273

274

// Configure transpose kernel

275

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

276

277

// Configure and tune matrix multiply kernel

278

_mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());

279

280

CLScheduler::get().tune_kernel_static(*_mm_kernel);

281

282

// Request memory for LHS and RHS reshape matrix

283

_aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());

284

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

285

}

286

287

void ClGemm::configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

288

const GEMMInfo &gemm_info)

289

{

290

DataType data_type = a->data_type();

291

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

292

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

293

const unsigned int n = b->dimension(0);

294

const unsigned int k = a->dimension(0);

295

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

296

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

297

const GPUTarget gpu_target = CLScheduler::get().target();

298

bool broadcast_bias = gemm_info.broadcast_bias();

299

300

GEMMKernelInfo kernel_info;

kernel_info.m = m;

kernel_info.n = n;

kernel_info.k = k;

kernel_info.depth_output_gemm3d = depth_output_gemm3d;

305

kernel_info.reinterpret_input_as_3d = false;

306

kernel_info.broadcast_bias = broadcast_bias;

307

kernel_info.activation_info = gemm_info.activation_info();

308

309

// Set the target for the kernels

310

_reshape_lhs_kernel->set_target(gpu_target);

311

_mm_kernel->set_target(gpu_target);

312

313

GEMMLHSMatrixInfo lhs_info{};

314

GEMMRHSMatrixInfo rhs_info{};

315

316

// Pick up the GEMM configuration

317

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b,

318

c, output, gemm_info.reinterpret_input_as_3d());

319

320

_reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());

321

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

322

323

// Configure and tune matrix multiply kernel

324

_mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

325

326

// Request memory for LHS and RHS reshape matrix

327

_aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());

328

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

329

}

330

331

void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,

332

const GEMMInfo &gemm_info)

333

{

334

DataType data_type = a->data_type();

335

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

336

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

337

const unsigned int n = b->dimension(0);

338

const unsigned int k = a->dimension(0);

339

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

340

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

341

const GPUTarget gpu_target = CLScheduler::get().target();

342

bool broadcast_bias = gemm_info.broadcast_bias();

343

344

GEMMKernelInfo kernel_info;

kernel_info.m = m;

kernel_info.n = n;

kernel_info.k = k;

kernel_info.depth_output_gemm3d = depth_output_gemm3d;

349

kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;

350

kernel_info.broadcast_bias = broadcast_bias;

351

kernel_info.activation_info = gemm_info.activation_info();

352

353

// Set the target for the kernels

354

_mm_kernel->set_target(gpu_target);

355

356

GEMMLHSMatrixInfo lhs_info{};

357

GEMMRHSMatrixInfo rhs_info{};

358

359

// Pick up the GEMM configuration

360

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output);

361

362

// Transpose matrix

363

_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);

364

365

// Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true)

366

// During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have

367

// pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false

368

369

// Configure matrix multiply kernel with no y padding support

370

kernel_info.has_pad_y = false;

371

_mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

372

373

// Configure matrix multiply kernel with y padding support

374

kernel_info.has_pad_y = true;

375

_mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);

376

377

// Request memory for RHS reshape matrix

378

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

379

}

380

381

// Validate the NATIVE_V1 path: checks the matrix multiply kernel on the non-reshaped operands.
// Returns the first error reported, or an empty Status on success.
Status ClGemm::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // Note: alpha and output are forwarded to the kernel validation below, so they are not marked as unused.

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // No reshape is involved, hence both reshape multipliers are 1
    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta,
                                                                     false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));

    return Status{};
}

// Validate the RESHAPED_V1 path: checks the LHS reshape kernel, the RHS reshape kernel and the matrix
// multiply kernel on the reshaped operands, using the same fixed reshape parameters as configure_reshaped_v1().
// Returns the first error reported, or an empty Status on success.
Status ClGemm::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // Note: alpha and output are forwarded to the kernel validation below, so they are not marked as unused.

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // Reshape multipliers: 1 by default, larger on BIFROST
    int mult_transpose1xW_width   = 1;
    int mult_interleave4x4_height = 1;
    if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
    {
        mult_transpose1xW_width   = 4;
        mult_interleave4x4_height = 2;
    }

    GEMMRHSMatrixInfo rhs_info;
    rhs_info.n0         = 16 / b->element_size();
    rhs_info.k0         = 1;
    rhs_info.h0         = mult_transpose1xW_width;
    rhs_info.interleave = false;
    rhs_info.transpose  = false;

    GEMMLHSMatrixInfo lhs_info;
    lhs_info.m0         = 4;
    lhs_info.k0         = 4;
    lhs_info.v0         = mult_interleave4x4_height;
    lhs_info.interleave = true;
    lhs_info.transpose  = true;

    // reinterpret_input_as_3d is passed as false here; the flag is instead given to the LHS reshape kernel below
    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());

    // Validate interleave kernel
    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d));

    // Validate transpose kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
                                                                     true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));

    return Status{};
}

// Validate the RESHAPED path: checks the LHS reshape kernel, the RHS reshape kernel and the reshaped matrix
// multiply kernel, using the configuration returned by the default heuristics.
// Returns the first error reported, or an empty Status on success.
Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // Note: alpha and output are forwarded to the kernel validation below, so they are not marked as unused.

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const DataType     data_type               = a->data_type();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
    const bool         broadcast_bias          = gemm_info.broadcast_bias();

    GEMMKernelInfo kernel_info;
    kernel_info.m                   = m;
    kernel_info.n                   = n;
    kernel_info.k                   = k;
    kernel_info.depth_output_gemm3d = depth_output_gemm3d;
    // Kept false for the multiply kernel: the 3D reinterpretation flag is given to the LHS reshape kernel instead
    kernel_info.reinterpret_input_as_3d = false;
    kernel_info.broadcast_bias          = broadcast_bias;
    kernel_info.activation_info         = gemm_info.activation_info();

    // Pick up the GEMM configuration
    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
    const auto              gemm_config = auto_heuristics::select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
    const GEMMLHSMatrixInfo lhs_info    = gemm_config.lhs_info;
    const GEMMRHSMatrixInfo rhs_info    = gemm_config.rhs_info;

    // Validate reshape LHS kernel
    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d));

    // Validate reshape RHS kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    return Status{};
}

// Validate the RESHAPED_ONLY_RHS path: checks the RHS reshape kernel and the reshaped-only-RHS matrix
// multiply kernel (both without and with y padding), using the configuration returned by the default heuristics.
// Returns the first error reported, or an empty Status on success.
Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // Note: alpha and output are forwarded to the kernel validation below, so they are not marked as unused.

    TensorInfo tmp_b_info{};

    // Get the GPU target
    const GPUTarget    gpu_target              = CLScheduler::get().target();
    const DataType     data_type               = a->data_type();
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
    const bool         broadcast_bias          = gemm_info.broadcast_bias();

    GEMMKernelInfo kernel_info;
    kernel_info.m                       = m;
    kernel_info.n                       = n;
    kernel_info.k                       = k;
    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    kernel_info.broadcast_bias          = broadcast_bias;
    kernel_info.activation_info         = gemm_info.activation_info();

    // Pick up the GEMM configuration
    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
    const auto              gemm_config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
    const GEMMLHSMatrixInfo lhs_info    = gemm_config.lhs_info;
    const GEMMRHSMatrixInfo rhs_info    = gemm_config.rhs_info;

    // Validate reshape RHS kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));

    // Validate matrix multiply, first without and then with y padding support
    kernel_info.has_pad_y = false;
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    kernel_info.has_pad_y = true;
    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));

    return Status{};
}

void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)

556

{

557

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

558

559

// Perform validation step

560

ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));

561

562

// Check if we need to reshape the matrix B only on the first run

563

_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

564

565

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

566

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

567

const unsigned int n = b->dimension(0);

568

const unsigned int k = a->dimension(0);

569

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

570

571

// Select GEMMType

Giorgio Arena

4403ed3

2021-05-17 13:03:50 +0100

[diff] [blame]

572

_gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run,

573

gemm_info.constant_weights());

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

574

575

const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);

576

577

ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;

578

579

switch(_gemm_kernel_type)

580

{

581

case CLGEMMKernelType::NATIVE_V1:

582

{

583

configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

584

break;

585

}

586

case CLGEMMKernelType::RESHAPED_V1:

587

{

588

configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

589

break;

590

}

591

case CLGEMMKernelType::RESHAPED:

592

{

593

configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

594

break;

595

}

596

case CLGEMMKernelType::RESHAPED_ONLY_RHS:

597

{

598

configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);

break;

}

default:

{

ARM_COMPUTE_ERROR("GEMMType not supported");

}

}

}

// Validate a GEMM configuration: selects the GEMM kernel type exactly as configure() does and delegates to the
// matching validate_* routine. c is forwarded only when beta is non-zero and c is not null.
// Returns an error Status if a or b is null, if the selected path rejects the arguments, or if the selected
// kernel type is not supported; an empty Status otherwise.
Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
    // a and b are dereferenced right below to read the problem size: report null pointers as an error
    // status instead of dereferencing them.
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b);

    // Read the problem size from the tensor dimensions
    const bool         reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

    // Select GEMMType
    CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
    {
        CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
    },
    gemm_info.reshape_b_only_on_first_run(), gemm_info.constant_weights());

    // The addition of c is fused only when it would contribute to the result
    const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);

    const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;

    switch(gemm_kernel_type)
    {
        case CLGEMMKernelType::NATIVE_V1:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED_V1:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        {
            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
            break;
        }
        default:
        {
            ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
        }
    }

    return Status{};
}

// Execute the GEMM selected at configure time.
// Reads LHS (ACL_SRC_0), RHS (ACL_SRC_1), the optional third source (ACL_SRC_2) and the destination (ACL_DST)
// from the pack, calls prepare(), then enqueues the reshape kernel(s) required by the selected path followed
// by the matching matrix multiply kernel.
void ClGemm::run(ITensorPack &tensors)
{
    const ITensor *lhs  = tensors.get_const_tensor(ACL_SRC_0);
    const ITensor *rhs  = tensors.get_const_tensor(ACL_SRC_1);
    const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2);
    ITensor       *dst  = tensors.get_tensor(ACL_DST);

    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst);

    // Handlers for the auxiliary tensors holding the reshaped operands
    CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true);
    CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true);

    // Prepare the consts if needed
    prepare(tensors);

    // RHS reshape shared by the reshaped paths: skipped when B is reshaped on the first run only
    const auto reshape_rhs_if_needed = [&]()
    {
        if(_reshape_b_only_on_first_run)
        {
            return;
        }
        // Run transpose kernel
        ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
        CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
    };

    // Run matrix multiply kernel
    switch(_gemm_kernel_type)
    {
        case CLGEMMKernelType::NATIVE_V1:
        {
            CLScheduler::get().enqueue_op(*_mm_kernel, tensors, true);
            break;
        }
        case CLGEMMKernelType::RESHAPED_V1:
        case CLGEMMKernelType::RESHAPED:
        {
            // Run interleave kernel
            ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } };
            CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);

            reshape_rhs_if_needed();

            // Both variants consume the reshaped operands; only the multiply kernel differs
            ITensorPack gemm_reshaped_pack{ { ACL_SRC_0, lhs_reshaped.get() }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
            if(_gemm_kernel_type != CLGEMMKernelType::RESHAPED)
            {
                CLScheduler::get().enqueue_op(*_mm_kernel, gemm_reshaped_pack, true);
            }
            else
            {
                CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
            }
            break;
        }
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        {
            reshape_rhs_if_needed();

            // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement:
            // the fallback kernel is used as soon as the lhs or dst tensor has top/bottom padding
            const auto         lhs_padding         = lhs->info()->padding();
            const auto         dst_padding         = dst->info()->padding();
            const unsigned int cross_plane_pad_lhs = lhs_padding.top + lhs_padding.bottom;
            const unsigned int cross_plane_pad_dst = dst_padding.top + dst_padding.bottom;
            const bool         has_pad_y           = !((cross_plane_pad_lhs == 0) && (cross_plane_pad_dst == 0));

            ITensorPack gemm_reshaped_onlyrhs_pack{ { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };

            auto &mm_kernel = has_pad_y ? *_mm_reshaped_only_rhs_fallback_kernel : *_mm_reshaped_only_rhs_kernel;
            CLScheduler::get().enqueue_op(mm_kernel, gemm_reshaped_onlyrhs_pack, true);
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("GEMMType not supported");
        }
    }
}

// Reshape the RHS matrix ahead of run() when its reshaped form is kept across runs.
// The reshape is enqueued only when the RHS auxiliary memory is Persistent and both the RHS source (ACL_SRC_1)
// and the auxiliary RHS tensor are present in the pack; otherwise nothing is done and the RHS is assumed to
// be already transformed.
void ClGemm::prepare(ITensorPack &constants)
{
    const ITensor *src1    = constants.get_const_tensor(ACL_SRC_1);
    ICLTensor     *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));

    // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed
    const bool rhs_is_persistent = (_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent);
    if(rhs_is_persistent && (src1 != nullptr) && (rhs_aux != nullptr))
    {
        CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
        ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);

        ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
        CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
    }
}

// Return the auxiliary memory requirements recorded in _aux_mem by the configure_* routines
// (entries for the reshaped LHS and/or RHS tensors, depending on the configured path).
experimental::MemoryRequirements ClGemm::workspace() const

{

return _aux_mem;

}

} // namespace opencl

760

} // namespace arm_compute