Blame - src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2017-10-09 15:05:40 +0100

[diff] [blame]

{

}

Chunosov

2017-11-22 20:42:13 +0700

[diff] [blame]

50

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

51

{

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

52

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

53

ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

54

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

55

// Clear state

Anthony Barbier

71d9b57

2018-07-06 17:05:59 +0100

[diff] [blame]

56

_mtx_a_reshape_kernel = nullptr;

57

_mtx_b_reshape_kernel = nullptr;

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

58

59

// Set internal variables

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

60

_a_offset = a->info()->quantization_info().offset;

61

_b_offset = b->info()->quantization_info().offset;

62

_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame]

63

_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

64

_is_prepared = false;

65

_original_b = b;

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

66

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

67

#ifdef __aarch64__

68

switch(a->info()->data_type())

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

69

{

Pablo Tello

66c656a

2018-03-15 10:34:58 +0000

[diff] [blame]

70

case DataType::QASYMM8:

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

71

case DataType::U8:

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

72

case DataType::S8:

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

73

{

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

74

_asm_glue.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run);

75

_dot_product_path = _asm_glue.is_configured();

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

break;

}

default:

{

ARM_COMPUTE_ERROR("Datatype not supported");

81

break;

82

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

83

}

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

84

#endif /* __aarch64__ */

85

if(!_dot_product_path)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

86

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

87

if(_run_vector_matrix_multiplication)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

88

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

89

// Configure matrix multiply kernel

90

{

91

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

92

k->configure(a, b, output);

93

_mm_kernel = std::move(k);

94

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

95

}

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

96

else

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

97

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

98

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

Isabella Gottardi

2018-01-18 15:50:39 +0000

[diff] [blame]

99

TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

100

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

Isabella Gottardi

2018-01-18 15:50:39 +0000

[diff] [blame]

101

TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

102

_tmp_a.allocator()->init(info_a);

103

_tmp_b.allocator()->init(info_b);

104

_memory_group.manage(&_tmp_a);

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame]

105

if(!_reshape_b_only_on_first_run)

106

{

107

_memory_group.manage(&_tmp_b);

108

}

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

109

110

// Configure interleave kernel

111

{

112

auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();

113

k->configure(a, &_tmp_a);

114

_mtx_a_reshape_kernel = std::move(k);

115

}

116

117

// Configure transpose kernel

118

{

119

auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();

120

k->configure(b, &_tmp_b);

121

_mtx_b_reshape_kernel = std::move(k);

122

}

123

124

// Configure matrix multiply kernel

125

{

126

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

127

k->configure(&_tmp_a, &_tmp_b, output);

128

_mm_kernel = std::move(k);

129

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

130

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

131

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

132

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

133

// Initialize matrix B reduction kernel only if _a_offset is not equal to 0

134

if(_a_offset != 0)

135

{

Isabella Gottardi

2018-01-18 15:50:39 +0000

[diff] [blame]

136

TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

137

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

138

_vector_sum_col.allocator()->init(info_vector_sum_col);

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame]

139

if(!_reshape_b_only_on_first_run)

140

{

141

_memory_group.manage(&_vector_sum_col);

142

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

143

144

// Configure Matrix B reduction kernel

145

_mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);

146

}

147

148

// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0

149

if(_b_offset != 0)

150

{

Isabella Gottardi

2018-01-18 15:50:39 +0000

[diff] [blame]

151

TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);

152

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

153

_vector_sum_row.allocator()->init(info_vector_sum_row);

154

_memory_group.manage(&_vector_sum_row);

155

156

// Configure matrix A reduction kernel

157

_mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);

158

}

159

160

// Configure offset contribution kernel

161

_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);

162

163

// Allocate tensors

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

164

if(!_dot_product_path && !_run_vector_matrix_multiplication)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

165

{

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

166

_tmp_a.allocator()->allocate();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

167

if(!_reshape_b_only_on_first_run)

168

{

169

_tmp_b.allocator()->allocate();

170

}

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

171

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

172

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

173

if(_a_offset != 0 && !_reshape_b_only_on_first_run)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

174

{

175

_vector_sum_col.allocator()->allocate();

}

if(_b_offset != 0)

{

_vector_sum_row.allocator()->allocate();

181

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

182

}

183

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame]

184

// Checks whether the function can be configured with the given tensor infos.
//
// a, b      : infos of the input matrices; a must be single-channel QASYMM8 and b must share its data type.
// output    : info of the destination; must be single-channel S32 with a's row count and b's column count.
// gemm_info : pre-reshaped A or B, reinterpreting the input as 3D and a 3D output depth other than 1 are rejected.
//
// Returns an empty Status when every check passes, otherwise the first failing check's error.
// The temporary shapes come from the same shape helpers configure() uses, so both
// functions always agree on them.
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
                                    "The output matrix must have the same number of rows as the matrix A");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
                                    "The output matrix must have the same number of columns as the matrix B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");

    const int32_t a_offset                         = a->quantization_info().offset;
    const int32_t b_offset                         = b->quantization_info().offset;
    const bool    run_vector_matrix_multiplication = a->dimension(1) < 2;

    if(!run_vector_matrix_multiplication)
    {
        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
        TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type());
        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type());

        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
    }
    else
    {
        // Vector-by-matrix case: the matrix multiply kernel works on a and b directly
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
    }

    // Declared outside the branches below because the offset contribution check takes their addresses
    TensorInfo info_vector_sum_col, info_vector_sum_row;

    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
    }

    // Validate matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
    }

    // Validate offset contribution kernel; a reduction info is passed only when its offset is non-zero
    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                             a_offset, b_offset));

    return Status{};
}

257

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

258

void NEGEMMLowpMatrixMultiplyCore::run()

259

{

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

260

prepare();

261

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

262

_memory_group.acquire();

263

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

264

// Reshape inputs

265

if(_mtx_a_reshape_kernel)

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

266

{

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

267

NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);

268

}

269

if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)

270

{

271

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

272

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

273

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

274

// Run GEMM

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

275

if(_asm_glue.is_configured())

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

276

{

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

277

_asm_glue.run();

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

}

else

{

NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);

282

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

283

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

284

// Run matrix A reduction kernel only if _b_offset is not equal to 0

285

if(_b_offset != 0)

286

{

287

NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);

288

}

289

290

// Run matrix B reduction kernel only if _a_offset is not equal to 0

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

291

if(_a_offset != 0 && !_reshape_b_only_on_first_run)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

292

{

293

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

294

}

295

296

// Run offset contribution kernel

297

NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);

298

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

299

_memory_group.release();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

300

}

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame]

301

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

302

void NEGEMMLowpMatrixMultiplyCore::prepare()

{

if(!_is_prepared)

{

// Run assembly reshape

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

307

if(_asm_glue.is_configured() && _reshape_b_only_on_first_run)

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

308

{

309

ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

310

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

311

_asm_glue.prepare();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

312

_original_b->mark_as_unused();

313

}

314

// Run non-assembly reshape

315

else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)

316

{

317

ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

318

319

// Run reshape kernel and mark original weights tensor as unused

320

_tmp_b.allocator()->allocate();

321

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

322

_original_b->mark_as_unused();

323

}

324

325

// Run matrix B reduction kernel only if _a_offset is not equal to 0

326

if(_a_offset != 0 && _reshape_b_only_on_first_run)

327

{

328

_vector_sum_col.allocator()->allocate();

329

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

}

_is_prepared = true;

}

Pablo Tello