Blame - src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)

89

{

90

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel

91

TensorInfo mm_result_s32_info{};

92

// Output tensor auto initialization if not yet initialized

93

auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));

94

// Validate mm kernel

95

// NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info

96

// NOTE: This assumes:

97

// 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments).

98

// 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).

99

if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))

{

return false;

}

return true;

}

// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs

107

std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)

108

{

109

auto config = auto_heuristics::select_mlgo_gemm_config_native(query);

110

if(config)

111

{

112

if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))

113

{

114

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

115

return { config.lhs_info, config.rhs_info };

116

}

117

}

118

config = auto_heuristics::select_default_gemm_config_native(query);

119

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

120

return { config.lhs_info, config.rhs_info };

121

}

122

123

// Validate lhs_info and rhs_info for reshaped only rhs kernel

124

inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,

125

unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)

126

{

127

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel

128

TensorInfo tmp_b_info{};

129

// Validate reshape RHS kernel

130

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));

131

if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))

{

return false;

}

// Validate mm kernel

136

// NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info

137

// NOTE: This assumes:

138

// 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).

139

// 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).

140

GEMMKernelInfo gemm_kernel_info;

141

gemm_kernel_info.m = m;

142

gemm_kernel_info.n = n;

143

gemm_kernel_info.k = k;

144

gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;

145

gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;

146

gemm_kernel_info.lhs_info = lhs_info;

147

gemm_kernel_info.rhs_info = rhs_info;

148

// Since we ignore the output stage, output data type has to be S32 to pass the validation

149

TensorInfo output_info_copy(*output);

150

output_info_copy.set_data_type(DataType::S32);

151

if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))

{

return false;

}

return true;

}

Freddie Liardet

2022-05-16 14:09:10 +0100

[diff] [blame]

158

// Validate lhs_info and rhs_info for reshaped only rhs kernel

159

inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,

160

unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)

161

{

162

// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel

163

TensorInfo tmp_b_info{};

164

// Validate reshape RHS kernel

165

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));

166

if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))

{

return false;

}

// Validate mm kernel

171

// NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info

172

// NOTE: This assumes:

173

// 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).

174

// 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).

175

GEMMKernelInfo gemm_kernel_info;

176

gemm_kernel_info.m = m;

177

gemm_kernel_info.n = n;

178

gemm_kernel_info.k = k;

179

gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;

180

gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;

181

gemm_kernel_info.lhs_info = lhs_info;

182

gemm_kernel_info.rhs_info = rhs_info;

183

// Since we ignore the output stage, output data type has to be S32 to pass the validation

184

TensorInfo output_info_copy(*output);

185

output_info_copy.set_data_type(DataType::S32);

186

if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))

{

return false;

}

return true;

}

Georgios Pinitas

2021-07-08 15:36:07 +0100

[diff] [blame]

193

// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs

194

std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,

195

const ITensorInfo *a,

196

const ITensorInfo *b, const ITensorInfo *output)

197

{

198

auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);

199

if(config)

200

{

201

if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d))

202

{

203

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

204

return { config.lhs_info, config.rhs_info };

205

}

206

}

207

config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);

208

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());

209

return { config.lhs_info, config.rhs_info };

210

}

211

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

212

// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs

213

std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,

214

const ITensorInfo *a,

215

const ITensorInfo *b, const ITensorInfo *output)

216

{

217

ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);

218

auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);

219

validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d);

220

ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),

221

to_string(config.rhs_info).c_str());

222

return { config.lhs_info, config.rhs_info };

223

}

224

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

225

// Tell whether the given GEMMLowp kernel type works on a reshaped RHS matrix.
// Returns false for NATIVE, true for RESHAPED_ONLY_RHS and RESHAPED_ONLY_RHS_MMUL.
// Any other kernel type is reported through ARM_COMPUTE_ERROR.
inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
{
    if(kernel_type == CLGEMMKernelType::NATIVE)
    {
        return false;
    }

    if(kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS || kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
    {
        return true;
    }

    ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
}

} // namespace

ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()

241

: _weights_to_qasymm8(std::make_unique<ClCastKernel>()),

242

_mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),

243

_mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

244

_mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()),

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

245

_mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),

246

_mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),

247

_mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),

248

_offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),

249

_offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),

250

_aux_mem(AuxTensorIdx::Count)

{

}

// Destructor, defaulted in this translation unit.
// NOTE(review): presumably kept out of the header so the kernel types held by the smart-pointer members
// only need to be complete here - confirm against the class declaration.
ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;

255

256

void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,

257

ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output,

258

const GEMMInfo &gemm_info)

259

{

260

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

261

ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));

ramelg01

2e53f17

2021-09-22 10:48:25 +0100

[diff] [blame]

262

ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

263

264

_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

265

_a_offset = a->quantization_info().uniform().offset;

266

_convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())

267

&& a->data_type() == DataType::QASYMM8;

268

_b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;

269

_gemm_info = gemm_info;

270

271

// Get the GPU target

272

const GPUTarget gpu_target = CLScheduler::get().target();

273

274

// Set the target for the kernels

275

_mm_native_kernel->set_target(gpu_target);

276

_mm_reshaped_only_rhs_kernel->set_target(gpu_target);

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

277

_mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

278

279

GEMMRHSMatrixInfo rhs_info;

280

GEMMLHSMatrixInfo lhs_info;

281

282

// Arguments used by GEMMReshapeInfo

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

283

// in order to know how the matrices have been reshaped

284

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

285

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

286

const unsigned int n = b->dimension(0);

287

const unsigned int k = a->dimension(0);

288

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

289

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

290

291

const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

292

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

293

_gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

294

295

if(_convert_to_qasymm8)

296

{

297

// Set data type for converted weights

298

_qasymm8_weights = *b;

299

_qasymm8_weights.set_data_type(DataType::QASYMM8);

300

_weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP);

301

}

302

303

ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

304

if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

{

matrix_b = &_tmp_b;

// Pick up the GEMM configuration

309

// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration

310

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,

311

depth_output_gemm3d,

312

a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);

313

314

// Configure reshape RHS kernel

315

_mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);

316

}

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

317

if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)

{

matrix_b = &_tmp_b;

// Pick up the GEMM configuration

322

// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration

323

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,

324

depth_output_gemm3d,

325

a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);

326

327

// Configure reshape RHS kernel

328

_mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);

329

}

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

330

331

// Using default reduction info

332

const GEMMLowpReductionKernelInfo reduction_info {};

333

334

// Initialize matrix B reduction kernel only if _a_offset is not equal to 0

335

if(_a_offset != 0)

336

{

337

_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

338

339

// Configure Matrix B reduction kernel

340

_mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);

341

}

342

343

// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0

344

if(_b_offset != 0)

345

{

346

_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

347

348

// Configure matrix A reduction kernel

349

_mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);

350

}

351

352

GEMMKernelInfo gemm_kernel_info;

353

gemm_kernel_info.m = m;

354

gemm_kernel_info.n = n;

355

gemm_kernel_info.k = k;

356

gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;

357

gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;

358

gemm_kernel_info.lhs_info = lhs_info;

359

gemm_kernel_info.rhs_info = rhs_info;

360

gemm_kernel_info.a_offset = _a_offset;

361

gemm_kernel_info.b_offset = _b_offset;

362

// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage

363

if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)

364

{

365

// Configure offset contribution kernel

366

const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

367

368

_gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);

369

_gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32);

370

371

GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();

372

gemmlowp_output_stage.output_data_type = a->data_type();

373

if(num_filters == 1)

374

{

375

// Per-channel quantization with OFM == 1 is equivalent to uniform quantization.

376

// Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts

377

gemmlowp_output_stage.is_quantized_per_channel = false;

378

}

379

380

gemm_kernel_info.output_stage = gemmlowp_output_stage;

381

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

382

if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

383

{

384

// Configure and tune matrix multiply kernel with fused output stage

385

_mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,

386

_b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);

387

}

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

388

else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

389

{

390

// Configure and tune matrix multiply kernel with fused output stage

391

_mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,

392

_b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);

393

}

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

394

else

395

{

396

_run_output_stage = true;

397

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

398

if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

399

{

400

_mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);

401

}

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

402

if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)

403

{

404

_mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);

405

}

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

406

else

407

{

408

// Pick up the GEMM configuration

409

// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration

410

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },

411

a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);

412

413

// Configure matrix multiply kernel

414

_mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);

415

416

_offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,

417

c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,

418

&_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);

}

}

}

else

{

_run_offset_contribution = true;

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

425

if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

426

{

427

// Configure and tune matrix multiply kernel

428

_mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);

429

}

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

430

else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)

431

{

432

// Configure and tune matrix multiply kernel

433

_mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);

434

}

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

435

else

436

{

437

// Pick up the GEMM configuration

438

// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration

439

std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },

440

a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);

441

442

// Configure matrix multiply kernel

443

_mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);

444

}

445

446

// Configure offset contribution kernel

447

_offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,

448

c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset);

}

// Request memory

_aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

453

if(is_gemm_reshaped(_gemm_kernel_type))

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

454

{

455

// Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation

456

_aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());

457

_aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());

}

if(_a_offset != 0)

{

_aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size());

}

if(_b_offset != 0)

{

_aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());

466

}

467

_aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());

468

_aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size());

469

_aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());

470

}

471

472

Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)

473

{

474

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

475

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);

476

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

477

ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);

478

ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);

479

ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");

480

ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

481

482

int32_t a_offset = a->quantization_info().uniform().offset;

483

int32_t b_offset = b->quantization_info().uniform().offset;

484

485

const ITensorInfo *matrix_a_info = a;

486

487

TensorInfo tmp_b_info{};

488

GEMMRHSMatrixInfo rhs_info;

489

GEMMLHSMatrixInfo lhs_info;

490

491

// Get the GPU target

492

const GPUTarget gpu_target = CLScheduler::get().target();

493

494

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

495

const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);

496

const unsigned int n = b->dimension(0);

497

const unsigned int k = a->dimension(0);

498

const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);

499

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

500

501

bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));

502

503

const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

504

505

bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())

506

&& is_data_type_quantized_asymmetric(a->data_type());

507

TensorInfo weights_info(*b);

508

if(convert_to_qasymm8)

509

{

510

b_offset = -128;

511

weights_info.set_data_type(DataType::QASYMM8);

512

ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));

513

}

514

const ITensorInfo *matrix_b_info = &weights_info;

515

if(reshape_matrix_b)

516

{

517

matrix_b_info = &tmp_b_info;

518

519

// Pick up the GEMM configuration

520

// NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails

521

// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration

522

const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });

523

lhs_info = res.lhs_info;

524

rhs_info = res.rhs_info;

525

526

// Validate reshape RHS kernel

527

auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));

528

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));

529

}

530

531

TensorInfo info_vector_sum_col{};

532

TensorInfo info_vector_sum_row{};

533

534

const GEMMLowpReductionKernelInfo reduction_info;

535

// Validate matrix B reduction kernel only if _a_offset is not equal to 0

536

if(a_offset != 0)

537

{

538

info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

539

540

// Configure Matrix B reduction kernel

541

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));

542

}

543

544

// Validate Matrix A reduction kernel only if _b_offset is not equal to 0

545

if(b_offset != 0)

546

{

547

info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

548

549

// Configure matrix A reduction kernel

550

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));

551

}

552

553

GEMMKernelInfo gemm_kernel_info;

554

gemm_kernel_info.m = m;

555

gemm_kernel_info.n = n;

556

gemm_kernel_info.k = k;

557

gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;

558

gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;

559

gemm_kernel_info.lhs_info = lhs_info;

560

gemm_kernel_info.rhs_info = rhs_info;

561

gemm_kernel_info.a_offset = a_offset;

562

gemm_kernel_info.b_offset = b_offset;

563

if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)

564

{

565

const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

566

567

const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

568

569

GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();

570

gemmlowp_output_stage.output_data_type = a->data_type();

571

572

gemm_kernel_info.output_stage = gemmlowp_output_stage;

573

if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

574

{

575

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,

576

a_offset == 0 ? nullptr : &info_vector_sum_col,

577

b_offset == 0 ? nullptr : &info_vector_sum_row,

578

c,

579

&gemm_output_stage_multipliers_shifts_info,

580

&gemm_output_stage_multipliers_shifts_info));

}

else

{

TensorInfo mm_result_s32_info{};

if(reshape_matrix_b)

{

// Output tensor auto inizialitation if not yet initialized

589

auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

590

591

// Validate matrix multiply

592

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));

}

else

{

// Output tensor auto inizialitation if not yet initialized

597

auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

598

599

// Pick up the GEMM configuration

600

// NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails

601

// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration

602

const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });

603

lhs_info = res.lhs_info;

604

rhs_info = res.rhs_info;

605

606

// Validate matrix multiply

607

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));

608

}

609

610

// Validate offset contribution kernel

611

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,

612

a_offset == 0 ? nullptr : &info_vector_sum_col,

613

b_offset == 0 ? nullptr : &info_vector_sum_row,

c,

output,

a_offset, b_offset,

gemmlowp_output_stage,

618

&gemm_output_stage_multipliers_shifts_info,

619

&gemm_output_stage_multipliers_shifts_info));

}

}

else

{

if(reshape_matrix_b)

{

// Validate matrix multiply

627

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));

}

else

{

// Pick up the GEMM configuration

632

// It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration

633

const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });

634

lhs_info = res.lhs_info;

635

rhs_info = res.rhs_info;

636

637

// Validate matrix multiply

638

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));

639

}

640

641

if(output->total_size() != 0)

642

{

643

// Validate offset contribution kernel

644

ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,

645

a_offset == 0 ? nullptr : &info_vector_sum_col,

646

b_offset == 0 ? nullptr : &info_vector_sum_row,

647

c,

648

a_offset, b_offset));

}

}

return Status{};

}

/** Enqueue every OpenCL kernel needed for one execution of the low-precision GEMM.
 *
 * Operands are read from @p tensors: ACL_SRC_0 (a), ACL_SRC_1 (b), ACL_SRC_2 (c) and ACL_DST (dst).
 * Only a and dst are null-checked here. Intermediate tensors are obtained from the same pack
 * through CLAuxTensorHandler.
 *
 * Kernels are enqueued in this order:
 *  -# RHS reshape (reshaped kernel types only, skipped when the reshape is restricted to the first run)
 *  -# matrix B reduction (only if _a_offset != 0, skipped when the reshape is restricted to the first run)
 *  -# matrix A reduction (only if _b_offset != 0)
 *  -# matrix multiply (reshaped-only-RHS, reshaped-only-RHS MMUL, or native kernel)
 *  -# offset contribution / output stage kernel (only if _run_output_stage)
 *  -# offset contribution kernel (only if _run_offset_contribution)
 *
 * @param[in,out] tensors Pack holding the source, destination and auxiliary tensors.
 */
void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
{
    const ITensor *a   = tensors.get_const_tensor(ACL_SRC_0);
    const ITensor *b   = tensors.get_const_tensor(ACL_SRC_1);
    const ITensor *c   = tensors.get_const_tensor(ACL_SRC_2);
    ITensor       *dst = tensors.get_tensor(ACL_DST);

    ARM_COMPUTE_ERROR_ON_NULLPTR(a, dst);

    // Handlers for the auxiliary tensors used by the kernels below
    CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
    CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true);
    CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true);
    CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
    CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true);
    CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true);
    CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true);

    // Prepare the constant data if that has not been done yet
    prepare(tensors);

    // RHS as seen by the reshape and reduction kernels: the converted weights when a conversion is in use
    const ITensor *const rhs_source = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;

    // Row/column sums are handed to the kernels only when the matching offset is non-zero
    auto *const row_sums = (_b_offset != 0) ? vec_sum_row.get() : nullptr;
    auto *const col_sums = (_a_offset != 0) ? vec_sum_col.get() : nullptr;

    const bool use_reshaped_rhs = is_gemm_reshaped(_gemm_kernel_type);

    // The matrix multiply consumes the reshaped RHS whenever a reshaped kernel type is selected
    const ITensor *const matrix_b = use_reshaped_rhs ? tmp_b.get() : rhs_source;

    // Work on matrix B that is repeated on every run unless it is restricted to the first run
    if(!_reshape_b_only_on_first_run)
    {
        if(use_reshaped_rhs)
        {
            // Reshape matrix B
            ITensorPack reshape_rhs_pack =
            {
                { TensorType::ACL_SRC, rhs_source },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, reshape_rhs_pack, false);
        }

        if(_a_offset != 0)
        {
            // Matrix B reduction is needed only when _a_offset is non-zero
            ITensorPack reduce_rhs_pack =
            {
                { TensorType::ACL_SRC, rhs_source },
                { TensorType::ACL_DST, vec_sum_col.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, reduce_rhs_pack, false);
        }
    }

    // Matrix A reduction is needed only when _b_offset is non-zero
    if(_b_offset != 0)
    {
        ITensorPack reduce_lhs_pack =
        {
            { TensorType::ACL_SRC, a },
            { TensorType::ACL_DST, vec_sum_row.get() }
        };
        CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, reduce_lhs_pack, false);
    }

    // Matrix multiply
    if(use_reshaped_rhs)
    {
        ITensorPack mm_pack;
        if(!_run_offset_contribution)
        {
            // The kernel additionally receives the bias, the reduction vectors and the
            // output stage shifts/multipliers, and writes directly into dst
            mm_pack = ITensorPack(
            {
                { TensorType::ACL_SRC, a },
                { TensorType::ACL_SRC_1, matrix_b },
                { TensorType::ACL_BIAS, c },
                { TensorType::ACL_VEC_ROW_SUM, row_sums },
                { TensorType::ACL_VEC_COL_SUM, col_sums },
                { TensorType::ACL_SHIFTS, shifts.get() },
                { TensorType::ACL_MULTIPLIERS, multipliers.get() },
                { TensorType::ACL_DST, dst },
            });
        }
        else
        {
            // Plain multiply: the result goes to the S32 buffer when an output stage follows, to dst otherwise
            mm_pack = ITensorPack({ { TensorType::ACL_SRC_0, a },
                { TensorType::ACL_SRC_1, matrix_b },
                { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst }
            });
        }

        switch(_gemm_kernel_type)
        {
            case CLGEMMKernelType::RESHAPED_ONLY_RHS:
                CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, mm_pack, false);
                break;
            case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
                CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, mm_pack, false);
                break;
            default:
                ARM_COMPUTE_ERROR("Invalid reshaped kernel");
        }
    }
    else
    {
        ITensorPack native_mm_pack =
        {
            { TensorType::ACL_SRC_0, a },
            { TensorType::ACL_SRC_1, matrix_b },
            { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() }
        };
        CLScheduler::get().enqueue_op(*_mm_native_kernel, native_mm_pack, false);
    }

    if(_run_output_stage)
    {
        // Offset contribution / output stage kernel: consumes the S32 result and writes dst
        ITensorPack out_stage_pack =
        {
            { TensorType::ACL_SRC, res32.get() },
            { TensorType::ACL_BIAS, c },
            { TensorType::ACL_VEC_ROW_SUM, row_sums },
            { TensorType::ACL_VEC_COL_SUM, col_sums },
            { TensorType::ACL_SHIFTS, shifts.get() },
            { TensorType::ACL_MULTIPLIERS, multipliers.get() },
            { TensorType::ACL_DST, dst },
        };
        CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, out_stage_pack, true);
    }

    if(_run_offset_contribution)
    {
        // Offset contribution kernel: dst is both source and destination
        ITensorPack offset_pack =
        {
            { TensorType::ACL_SRC_DST, dst },
            { TensorType::ACL_BIAS, c },
            { TensorType::ACL_VEC_ROW_SUM, row_sums },
            { TensorType::ACL_VEC_COL_SUM, col_sums }
        };
        CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_pack, true);
    }
}

void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)

{

if(!_is_prepared)

{

auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);

797

CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);

798

CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);

799

CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false);

800

801

ARM_COMPUTE_ERROR_ON_NULLPTR(b);

802

803

if(_convert_to_qasymm8)

804

{

805

ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } };

806

CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);

Georgios Pinitas

9805583

2021-07-27 10:34:59 +0100

[diff] [blame]

807

b->mark_as_unused();

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

808

}

809

Freddie Liardet

e572dff

2022-05-16 14:09:10 +0100

[diff] [blame]

810

if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

811

{

Georgios Pinitas

f4e84fb

2021-07-08 15:36:07 +0100

[diff] [blame]

812

// Run reshape kernel and mark original weights tensor as unused

813

ITensorPack mtx_b_pack =

814

{

815

{ TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },

816

{ TensorType::ACL_DST, tmp_b.get() }

817

};

818

CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);

b->mark_as_unused();

}

// Run matrix B reduction kernel only if _a_offset is not equal to 0

823

if(_a_offset != 0 && _reshape_b_only_on_first_run)

824

{

825

ITensorPack mtx_b_red_pack =

826

{

827

{ TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },

828

{ TensorType::ACL_DST, vec_sum_col.get() }

829

};

830

CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);

831

}

832

833

// Compute GEMM output multipliers and shifts for output stage

834

{

835

const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

836

837

CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);

838

CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);

839

840

ICLTensor *multiplier_tensor = multipliers.get();

841

if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)

842

{

843

multiplier_tensor->map(CLScheduler::get().queue(), true);

844

std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));

845

multiplier_tensor->unmap(CLScheduler::get().queue());

846

}

847

848

ICLTensor *shifts_tensor = shifts.get();

849