Blame - src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2017-11-30 14:31:13 +0000

[diff] [blame]

51

_offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

{

}

Chunosov

2017-11-22 20:42:13 +0700

[diff] [blame]

55

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

56

{

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

57

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

58

ARM_COMPUTE_UNUSED(gemm_info);

59

ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

60

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

61

_a_offset = a->info()->quantization_info().offset;

62

_b_offset = b->info()->quantization_info().offset;

63

_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

64

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

65

#ifdef ARM_COMPUTE_AARCH64_V8_2

66

// Check for DOT product instruction

67

const struct CPUInfo ci = NEScheduler::get().cpu_info();

68

const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);

69

70

if(cpu_has_dotprod != 0)

71

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

72

_dot_product_path = true;

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

73

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

74

// Configure matrix multiply kernel

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

75

struct CPUInfo ci = NEScheduler::get().cpu_info();

76

const int M = output->info()->tensor_shape().y();

77

const int N = output->info()->tensor_shape().x();

78

const int K = a->info()->tensor_shape().x();

79

Pablo Tello

6681d24

2017-11-13 16:44:08 +0000

[diff] [blame]

80

const size_t workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();

81

constexpr size_t alignment = 4096;

82

_workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

83

_memory_group.manage(&_workspace);

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

84

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

85

// Configure matrix multiplication kernel

86

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();

87

k->configure(a, b, output, &_workspace, 1.f, 1.f);

88

_mm_kernel = std::move(k);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

89

}

90

else

91

#endif /* ARM_COMPUTE_AARCH64_V8_2 */

92

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

93

if(_run_vector_matrix_multiplication)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

94

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

95

// Configure matrix multiply kernel

96

{

97

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

98

k->configure(a, b, output);

99

_mm_kernel = std::move(k);

100

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

101

}

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

102

else

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

103

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

104

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

105

TensorShape shape_tmp_a = a->info()->tensor_shape();

106

shape_tmp_a.set(0, a->info()->dimension(0) * 4);

107

shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

108

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

109

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

110

TensorShape shape_tmp_b = b->info()->tensor_shape();

111

shape_tmp_b.set(0, b->info()->dimension(1) * 16);

112

shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));

113

114

TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());

115

TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());

116

_tmp_a.allocator()->init(info_a);

117

_tmp_b.allocator()->init(info_b);

118

_memory_group.manage(&_tmp_a);

119

_memory_group.manage(&_tmp_b);

120

121

// Configure interleave kernel

122

{

123

auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();

124

k->configure(a, &_tmp_a);

125

_mtx_a_reshape_kernel = std::move(k);

126

}

127

128

// Configure transpose kernel

129

{

130

auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();

131

k->configure(b, &_tmp_b);

132

_mtx_b_reshape_kernel = std::move(k);

133

}

134

135

// Configure matrix multiply kernel

136

{

137

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

138

k->configure(&_tmp_a, &_tmp_b, output);

139

_mm_kernel = std::move(k);

140

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

141

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

142

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

143

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

144

// Initialize matrix B reduction kernel only if _a_offset is not equal to 0

145

if(_a_offset != 0)

146

{

147

TensorShape shape_vector_sum_col = b->info()->tensor_shape();

Gian Marco

05288a2

2017-11-21 10:57:50 +0000

[diff] [blame]

148

if(b->info()->num_dimensions() > 1)

149

{

150

shape_vector_sum_col.remove_dimension(1);

151

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

152

TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);

153

_vector_sum_col.allocator()->init(info_vector_sum_col);

154

_memory_group.manage(&_vector_sum_col);

155

156

// Configure Matrix B reduction kernel

157

_mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);

158

}

159

160

// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0

161

if(_b_offset != 0)

162

{

163

TensorShape shape_vector_sum_row = a->info()->tensor_shape();

164

shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));

Gian Marco

05288a2

2017-11-21 10:57:50 +0000

[diff] [blame]

165

if(a->info()->num_dimensions() > 1)

166

{

167

shape_vector_sum_row.remove_dimension(1);

168

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

169

TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);

170

_vector_sum_row.allocator()->init(info_vector_sum_row);

171

_memory_group.manage(&_vector_sum_row);

172

173

// Configure matrix A reduction kernel

174

_mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);

175

}

176

177

// Configure offset contribution kernel

178

_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);

179

180

// Allocate tensors

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

181

if(!_dot_product_path && !_run_vector_matrix_multiplication)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

182

{

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

183

_tmp_a.allocator()->allocate();

184

_tmp_b.allocator()->allocate();

185

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

186

else

187

{

188

_workspace.allocator()->allocate();

}

if(_a_offset != 0)

{

_vector_sum_col.allocator()->allocate();

}

if(_b_offset != 0)

{

_vector_sum_row.allocator()->allocate();

199

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

200

}

201

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame^]

202

// Static check of whether the given tensor infos form a valid configuration for
// NEGEMMLowpMatrixMultiplyCore. Mirrors the path selection done in configure().
//
// a         - Info of input matrix A. Must be single-channel QASYMM8.
// b         - Info of input matrix B. Must have the same data type as a.
// output    - Info of the output matrix. Must be single-channel S32.
// gemm_info - GEMM meta-data. Already-reshaped A or B is rejected.
//
// Returns an empty Status on success, otherwise the first error encountered.
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
                                    "The output matrix must have the same number of rows as the matrix A");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
                                    "The output matrix must have the same number of columns as the matrix B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    const int32_t a_offset                         = a->quantization_info().offset;
    const int32_t b_offset                         = b->quantization_info().offset;
    const bool    run_vector_matrix_multiplication = a->dimension(1) < 2;

#ifdef ARM_COMPUTE_AARCH64_V8_2
    // Check for DOT product instruction
    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);

    if(cpu_has_dotprod != 0)
    {
        // Validate matrix multiply kernel.
        // validate() returns a Status, so it must be propagated with RETURN_ON_ERROR
        // (RETURN_ERROR_ON expects a failure condition, not a Status).
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
    }
    else
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
    {
        if(!run_vector_matrix_multiplication)
        {
            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
            TensorInfo info_b(shape_tmp_b, 1, b->data_type());

            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
        }
        else
        {
            // Vector-by-matrix case: the multiply kernel consumes a and b directly
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
        }
    }

    TensorInfo info_vector_sum_col, info_vector_sum_row;

    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
    if(a_offset != 0)
    {
        TensorShape shape_vector_sum_col = b->tensor_shape();
        // Same guard as in configure(): only drop dimension 1 when it exists
        if(b->num_dimensions() > 1)
        {
            shape_vector_sum_col.remove_dimension(1);
        }
        info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);

        // Validate Matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
    }

    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
    if(b_offset != 0)
    {
        TensorShape shape_vector_sum_row = a->tensor_shape();
        shape_vector_sum_row.set(Window::DimX, a->dimension(1));
        // Same guard as in configure(): only drop dimension 1 when it exists
        if(a->num_dimensions() > 1)
        {
            shape_vector_sum_row.remove_dimension(1);
        }
        info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);

        // Validate matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
    }

    // Validate offset contribution kernel (a null vector is passed when the matching offset is 0)
    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                             a_offset, b_offset));

    return Status{};
}

292

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

293

void NEGEMMLowpMatrixMultiplyCore::run()

294

{

295

_memory_group.acquire();

296

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

297

// Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction

298

if(!_run_vector_matrix_multiplication && !_dot_product_path)

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

299

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

300

if(_mtx_a_reshape_kernel)

301

{

302

NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);

303

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

304

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

305

if(_mtx_b_reshape_kernel)

306

{

307

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

308

}

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

309

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

310

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

311

NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);

312

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

313

// Run matrix A reduction kernel only if _b_offset is not equal to 0

314

if(_b_offset != 0)

315

{

316

NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);

317

}

318

319

// Run matrix B reduction kernel only if _a_offset is not equal to 0

320

if(_a_offset != 0)

321

{

322

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

323

}

324

325

// Run offset contribution kernel

326

NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);

327

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

328

_memory_group.release();

Pablo Tello