Blame - src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2017-11-21 10:57:50 +0000

[diff] [blame]

92

{

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

93

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

94

ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

95

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

96

_is_prepared = false;

97

_original_b = b;

Chunosov

2017-11-22 20:42:13 +0700

[diff] [blame]

98

_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

99

_a_offset = a->info()->quantization_info().offset;

100

_b_offset = b->info()->quantization_info().offset;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

101

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

102

// Get the GPU target

103

const GPUTarget gpu_target = CLScheduler::get().target();

Gian Marco

7b4d547

2018-01-10 15:56:30 +0000

[diff] [blame]

104

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

105

// Set the target for the kernels

106

_mtx_a_reshape_kernel.set_target(gpu_target);

107

_mm_kernel.set_target(gpu_target);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

108

109

const ICLTensor *matrix_a = a;

110

const ICLTensor *matrix_b = b;

111

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

112

// Arguments used by GEMMReshapeInfo

113

// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo

114

// in order to know how the matrices have been reshaped

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

115

bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

116

const bool unroll_block = dot8_supported(CLKernelLibrary::get().get_device());

Isabella Gottardi

2018-10-11 19:14:55 +0100

[diff] [blame]

117

const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

118

const int n = b->info()->dimension(0);

119

const int k = a->info()->dimension(0);

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

120

const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

121

constexpr int mult_transpose1xW_width = 1;

122

constexpr int mult_interleave4x4_height = 1;

123

124

// Check if we need to reshape the matrix A and matrix B

125

_is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);

126

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

127

if(_is_interleaved_transposed)

128

{

Isabella Gottardi

f02e527

2018-10-01 12:26:28 +0100

[diff] [blame]

129

// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

130

reinterpret_input_as_3d = false;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

131

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

matrix_a = &_tmp_a;

matrix_b = &_tmp_b;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

135

_memory_group.manage(&_tmp_a);

Giorgio Arena

bb54e4e

2018-04-05 17:20:34 +0100

[diff] [blame]

136

if(!_reshape_b_only_on_first_run)

137

{

138

_memory_group.manage(&_tmp_b);

139

}

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

140

141

// Configure interleave kernel

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

142

_mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

143

144

// Configure transpose kernel

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

145

_mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

146

}

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

147

148

// Initialize matrix B reduction kernel only if _a_offset is not equal to 0

149

if(_a_offset != 0)

150

{

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

151

TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

152

_vector_sum_col.allocator()->init(info_vector_sum_col);

Giorgio Arena

bb54e4e

2018-04-05 17:20:34 +0100

[diff] [blame]

153

if(!_reshape_b_only_on_first_run)

154

{

155

_memory_group.manage(&_vector_sum_col);

156

}

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

157

158

// Configure Matrix B reduction kernel

159

_mtx_b_reduction_kernel.configure(b, &_vector_sum_col);

160

}

161

162

// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0

163

if(_b_offset != 0)

164

{

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

165

TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

166

_vector_sum_row.allocator()->init(info_vector_sum_row);

167

_memory_group.manage(&_vector_sum_row);

168

169

// Configure matrix A reduction kernel

170

_mtx_a_reduction_kernel.configure(a, &_vector_sum_row);

171

}

172

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

173

// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage

174

if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)

175

{

176

_fuse_output_stage = true;

177

178

_memory_group.manage(&_mm_result_s32);

179

180

// Configure matrix multiply kernel

181

_mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,

182

mult_transpose1xW_width, mult_interleave4x4_height,

183

depth_output_gemm3d, reinterpret_input_as_3d));

184

185

// Configure offset contribution kernel

186

_offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),

187

_a_offset, _b_offset, gemm_info.gemmlowp_output_stage());

188

189

_mm_result_s32.allocator()->allocate();

}

else

{

// Configure matrix multiply kernel

194

_mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,

195

mult_transpose1xW_width, mult_interleave4x4_height,

196

depth_output_gemm3d, reinterpret_input_as_3d));

197

198

// Configure offset contribution kernel

199

_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);

200

}

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

201

202

// Allocate tensors

203

if(_is_interleaved_transposed)

204

{

205

_tmp_a.allocator()->allocate();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

206

if(!_reshape_b_only_on_first_run)

207

{

208

_tmp_b.allocator()->allocate();

209

}

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

210

}

211

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

212

if(_a_offset != 0 && !_reshape_b_only_on_first_run)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

213

{

214

_vector_sum_col.allocator()->allocate();

}

if(_b_offset != 0)

{

_vector_sum_row.allocator()->allocate();

}

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

223

/** Static check of whether the low-precision GEMM core can handle the given tensor infos.
 *
 * Walks the kernel pipeline in order and returns the first error reported:
 * optional interleave/transpose reshapes, optional matrix B / matrix A reduction kernels,
 * the matrix multiply kernel and finally the offset contribution kernel (the output-stage
 * variant is used when gemm_info requests an output stage other than NONE).
 *
 * @param[in] a         Info of matrix A. Checked to be single-channel QASYMM8.
 * @param[in] b         Info of matrix B. Checked to have the same data type as @p a.
 * @param[in] c         Info forwarded unchanged to the offset contribution kernel validation.
 * @param[in] output    Info of the destination tensor.
 * @param[in] gemm_info GEMM meta-data (reshape flags, 3D reinterpretation, output stage).
 *
 * @return An empty Status if every check passes, otherwise the first failing Status.
 */
Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    // Quantization offsets decide which reduction kernels take part in the pipeline
    const int32_t a_offset = a->quantization_info().offset;
    const int32_t b_offset = b->quantization_info().offset;

    // GEMM sizes: m folds dimension 2 of A into the row count when the input is reinterpreted as 3D
    const bool input_as_3d = gemm_info.reinterpret_input_as_3d();
    const int  m           = input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const int  n           = b->dimension(0);
    const int  k           = a->dimension(0);

    constexpr int transpose1xW_mult  = 1;
    constexpr int interleave4x4_mult = 1;

    const bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());

    // When the matrices are reshaped the 3D reinterpretation is forced off, as the output of
    // CLGEMMInterleaveKernel will be 2D
    const GEMMReshapeInfo reshape_info(m, n, k, transpose1xW_mult, interleave4x4_mult, gemm_info.depth_output_gemm3d(), !reshape_matrices && input_as_3d);

    // Infos fed to the matrix multiply kernel: the raw inputs, or their reshaped counterparts
    TensorInfo         tmp_a_info{};
    TensorInfo         tmp_b_info{};
    const ITensorInfo *lhs_info = a;
    const ITensorInfo *rhs_info = b;

    if(reshape_matrices)
    {
        // Validate interleave kernel (it still receives the 3D flag requested by gemm_info)
        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, interleave4x4_mult, input_as_3d)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, interleave4x4_mult, input_as_3d));

        // Validate transpose kernel
        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, transpose1xW_mult)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, transpose1xW_mult));

        lhs_info = &tmp_a_info;
        rhs_info = &tmp_b_info;
    }

    // Reduction outputs; the pointers stay null when the corresponding offset is zero
    TensorInfo  info_vector_sum_col{};
    TensorInfo  info_vector_sum_row{};
    TensorInfo *vector_sum_col = nullptr;
    TensorInfo *vector_sum_row = nullptr;

    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col));
        vector_sum_col = &info_vector_sum_col;
    }

    // Validate matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
        vector_sum_row = &info_vector_sum_row;
    }

    if(gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE)
    {
        // No output stage: the matrix multiply writes straight into output
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(lhs_info, rhs_info, output, reshape_matrices, reshape_info));

        // Validate offset contribution kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output, vector_sum_col, vector_sum_row, c, a_offset, b_offset));
    }
    else
    {
        // Output stage requested: the matrix multiply goes through an intermediate S32 result,
        // auto-initialized here from the multiply output shape
        TensorInfo mm_result_s32_info{};
        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*lhs_info, *rhs_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));

        // Validate matrix multiply
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(lhs_info, rhs_info, &mm_result_s32_info, reshape_matrices, reshape_info));

        // Validate offset contribution kernel fused with the output stage
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, vector_sum_col, vector_sum_row, c, output, a_offset, b_offset,
                                                                                            gemm_info.gemmlowp_output_stage()));
    }

    return Status{};
}

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

326

void CLGEMMLowpMatrixMultiplyCore::run()

327

{

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

328

prepare();

329

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

330

_memory_group.acquire();

331

332

if(_is_interleaved_transposed)

333

{

334

// Run reshape matrix A

335

CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);

336

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

337

if(!_reshape_b_only_on_first_run)

Chunosov

2017-11-22 20:42:13 +0700

[diff] [blame]

338

{

339

// Run reshape matrix B

340

CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);

}

}

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

344

// Run matrix B reduction kernel only if _a_offset is not equal to 0

345

if(_a_offset != 0 && !_reshape_b_only_on_first_run)

Chunosov

2017-11-22 20:42:13 +0700

[diff] [blame]

346

{

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

347

CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

348

}

349

350

// Run matrix multiply

351

CLScheduler::get().enqueue(_mm_kernel, false);

352

353

// Run matrix A reduction kernel only if _b_offset is not equal to 0

354

if(_b_offset != 0)

355

{

356

CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);

357

}

358

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

359

if(_fuse_output_stage)

360

{

361

// Run offset contribution/output stage kernel

362

CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);

}

else

{

// Run offset contribution kernel

367

CLScheduler::get().enqueue(_offset_contribution_kernel, true);

368

}

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

369

370

_memory_group.release();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

371

}

Chunosov

2017-11-22 20:42:13 +0700

[diff] [blame]

372

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

373

void CLGEMMLowpMatrixMultiplyCore::prepare()

{

if(!_is_prepared)

{

if(_is_interleaved_transposed && _reshape_b_only_on_first_run)

378

{

379

ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

380

381

// Run reshape kernel and mark original weights tensor as unused

382

_tmp_b.allocator()->allocate();

383

CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);

384

_original_b->mark_as_unused();

385

}

386

387

// Run matrix B reduction kernel only if _a_offset is not equal to 0

388

if(_a_offset != 0 && _reshape_b_only_on_first_run)

389

{

390

_vector_sum_col.allocator()->allocate();

391

CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);

392

}

393

394

CLScheduler::get().queue().finish();

395

_is_prepared = true;

396

}

Gian Marco