Blame - src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2017-10-09 15:05:40 +0100

[diff] [blame]

{

}

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)

56

{

Georgios Pinitas

a3b1b46

2017-11-16 19:24:39 +0000

[diff] [blame]

57

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

58

ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info()));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

59

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

60

bool dot_product_path = false;

61

62

_a_offset = a->info()->quantization_info().offset;

63

_b_offset = b->info()->quantization_info().offset;

64

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

65

#ifdef ARM_COMPUTE_AARCH64_V8_2

66

// Check for DOT product instruction

67

const struct CPUInfo ci = NEScheduler::get().cpu_info();

68

const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);

69

70

if(cpu_has_dotprod != 0)

71

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

72

dot_product_path = true;

73

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

74

// Configure matrix multiply kernel

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

75

struct CPUInfo ci = NEScheduler::get().cpu_info();

76

const int M = output->info()->tensor_shape().y();

77

const int N = output->info()->tensor_shape().x();

78

const int K = a->info()->tensor_shape().x();

79

Pablo Tello

6681d24

2017-11-13 16:44:08 +0000

[diff] [blame]

80

const size_t workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();

81

constexpr size_t alignment = 4096;

82

_workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

83

_memory_group.manage(&_workspace);

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

84

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

85

// Configure matrix multiplication kernel

86

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();

87

k->configure(a, b, output, &_workspace, 1.f, 1.f);

88

_mm_kernel = std::move(k);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

89

}

90

else

91

#endif /* ARM_COMPUTE_AARCH64_V8_2 */

92

{

93

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

94

TensorShape shape_tmp_a = a->info()->tensor_shape();

95

shape_tmp_a.set(0, a->info()->dimension(0) * 4);

96

shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));

97

98

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

99

TensorShape shape_tmp_b = b->info()->tensor_shape();

100

shape_tmp_b.set(0, b->info()->dimension(1) * 16);

101

shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));

102

103

TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());

104

TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());

105

_tmp_a.allocator()->init(info_a);

106

_tmp_b.allocator()->init(info_b);

107

_memory_group.manage(&_tmp_a);

108

_memory_group.manage(&_tmp_b);

109

110

// Configure interleave kernel

111

{

112

auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();

113

k->configure(a, &_tmp_a);

114

_mtx_a_reshape_kernel = std::move(k);

115

}

116

117

// Configure transpose kernel

118

{

119

auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();

120

k->configure(b, &_tmp_b);

121

_mtx_b_reshape_kernel = std::move(k);

122

}

123

124

// Configure matrix multiply kernel

125

{

126

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

127

k->configure(&_tmp_a, &_tmp_b, output);

128

_mm_kernel = std::move(k);

129

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

130

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

131

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

132

// Initialize matrix B reduction kernel only if _a_offset is not equal to 0

133

if(_a_offset != 0)

134

{

135

TensorShape shape_vector_sum_col = b->info()->tensor_shape();

Gian Marco

05288a2

2017-11-21 10:57:50 +0000

[diff] [blame]

136

if(b->info()->num_dimensions() > 1)

137

{

138

shape_vector_sum_col.remove_dimension(1);

139

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

140

TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);

141

_vector_sum_col.allocator()->init(info_vector_sum_col);

142

_memory_group.manage(&_vector_sum_col);

143

144

// Configure Matrix B reduction kernel

145

_mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);

146

}

147

148

// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0

149

if(_b_offset != 0)

150

{

151

TensorShape shape_vector_sum_row = a->info()->tensor_shape();

152

shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));

Gian Marco

05288a2

2017-11-21 10:57:50 +0000

[diff] [blame]

153

if(a->info()->num_dimensions() > 1)

154

{

155

shape_vector_sum_row.remove_dimension(1);

156

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

157

TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);

158

_vector_sum_row.allocator()->init(info_vector_sum_row);

159

_memory_group.manage(&_vector_sum_row);

160

161

// Configure matrix A reduction kernel

162

_mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);

163

}

164

165

// Configure offset contribution kernel

166

_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);

167

168

// Allocate tensors

169

if(!dot_product_path)

170

{

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

171

_tmp_a.allocator()->allocate();

172

_tmp_b.allocator()->allocate();

173

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

174

else

175

{

176

_workspace.allocator()->allocate();

}

if(_a_offset != 0)

{

_vector_sum_col.allocator()->allocate();

}

if(_b_offset != 0)

{

_vector_sum_row.allocator()->allocate();

187

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

188

}

189

Georgios Pinitas

a3b1b46

2017-11-16 19:24:39 +0000

[diff] [blame]

190

// Check whether a, b and output describe a multiplication that configure() is able to set up.
//
// The checks cover the data types of the three tensor infos, the agreement of their dimensions
// (a->dimension(0) against b->dimension(1), a->dimension(1) against output->dimension(1) and
// b->dimension(0) against output->dimension(0)), and the validate() function of each kernel that
// configure() would use for the same inputs. The intermediate tensor infos are built the same way
// configure() builds them.
//
// Returns the first error produced by any check, or a default-constructed Error if all of them pass.
Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
                                    "The output matrix must have the same number of rows as the matrix A");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
                                    "The output matrix must have the same number of columns as the matrix B");

    const int32_t a_offset = a->quantization_info().offset;
    const int32_t b_offset = b->quantization_info().offset;

#ifdef ARM_COMPUTE_AARCH64_V8_2
    // Check for DOT product instruction
    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);

    if(cpu_has_dotprod != 0)
    {
        // Validate matrix multiply kernel.
        // The kernel's validation result is forwarded with ARM_COMPUTE_RETURN_ON_ERROR, exactly like
        // every other kernel validated in this function, instead of being treated as a condition.
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
    }
    else
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
    {
        // Interleaved copy of a: [ a.dimension(0) * 4, ceil(a.dimension(1) / 4.0f) ]
        TensorShape shape_tmp_a = a->tensor_shape();
        shape_tmp_a.set(0, a->dimension(0) * 4);
        shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

        // Transposed (1xW) copy of b: [ b.dimension(1) * 16, ceil(b.dimension(0) / 16.0f) ]
        TensorShape shape_tmp_b = b->tensor_shape();
        shape_tmp_b.set(0, b->dimension(1) * 16);
        shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

        TensorInfo info_a(shape_tmp_a, 1, a->data_type());
        TensorInfo info_b(shape_tmp_b, 1, b->data_type());

        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
    }

    // Declared outside the branches below because the offset contribution check takes their addresses
    TensorInfo info_vector_sum_col;
    TensorInfo info_vector_sum_row;

    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        TensorShape shape_vector_sum_col = b->tensor_shape();
        // Same guard as in configure(): dimension 1 is only dropped when b has more than one dimension
        if(b->num_dimensions() > 1)
        {
            shape_vector_sum_col.remove_dimension(1);
        }
        info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);

        // Validate matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
    }

    // Validate matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        TensorShape shape_vector_sum_row = a->tensor_shape();
        shape_vector_sum_row.set(Window::DimX, a->dimension(1));
        // Same guard as in configure(): dimension 1 is only dropped when a has more than one dimension
        if(a->num_dimensions() > 1)
        {
            shape_vector_sum_row.remove_dimension(1);
        }
        info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);

        // Validate matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
    }

    // Validate offset contribution kernel; a reduction info that was not built is passed as nullptr
    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                             a_offset, b_offset));

    return Error{};
}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

270

void NEGEMMLowpMatrixMultiplyCore::run()

271

{

272

_memory_group.acquire();

273

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

274

if(_mtx_a_reshape_kernel)

275

{

276

NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);

277

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

278

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

279

if(_mtx_b_reshape_kernel)

280

{

281

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

282

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

283

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

284

NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);

285

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

286

// Run matrix A reduction kernel only if _b_offset is not equal to 0

287

if(_b_offset != 0)

288

{

289

NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);

290

}

291

292

// Run matrix B reduction kernel only if _a_offset is not equal to 0

293

if(_a_offset != 0)

294

{

295

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

296

}

297

298

// Run offset contribution kernel

299

NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);

300

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

301

_memory_group.release();

Pablo Tello