Blame - src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2018-04-05 17:20:34 +0100

[diff] [blame^]

46

_dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

{

}

Chunosov

2017-11-22 20:42:13 +0700

[diff] [blame]

50

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

51

{

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

52

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

53

ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

54

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

55

_a_offset = a->info()->quantization_info().offset;

56

_b_offset = b->info()->quantization_info().offset;

57

_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame^]

58

_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

59

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

60

#ifdef __aarch64__

61

switch(a->info()->data_type())

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

62

{

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

63

case DataType::S8:

64

{

Pablo Tello

7fad9b1

2018-03-14 17:55:27 +0000

[diff] [blame]

65

_dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

66

break;

67

}

Pablo Tello

66c656a

2018-03-15 10:34:58 +0000

[diff] [blame]

68

case DataType::QASYMM8:

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

69

case DataType::U8:

70

{

Pablo Tello

7fad9b1

2018-03-14 17:55:27 +0000

[diff] [blame]

71

_dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

break;

}

default:

{

ARM_COMPUTE_ERROR("Datatype not supported");

77

break;

78

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

79

}

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

80

#endif /* __aarch64__ */

81

if(!_dot_product_path)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

82

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

83

if(_run_vector_matrix_multiplication)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

84

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

85

// Configure matrix multiply kernel

86

{

87

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

88

k->configure(a, b, output);

89

_mm_kernel = std::move(k);

90

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

91

}

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

92

else

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

93

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

94

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

Isabella Gottardi

2018-01-18 15:50:39 +0000

[diff] [blame]

95

TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

96

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

Isabella Gottardi

2018-01-18 15:50:39 +0000

[diff] [blame]

97

TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

98

_tmp_a.allocator()->init(info_a);

99

_tmp_b.allocator()->init(info_b);

100

_memory_group.manage(&_tmp_a);

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame^]

101

if(!_reshape_b_only_on_first_run)

102

{

103

_memory_group.manage(&_tmp_b);

104

}

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

105

106

// Configure interleave kernel

107

{

108

auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();

109

k->configure(a, &_tmp_a);

110

_mtx_a_reshape_kernel = std::move(k);

111

}

112

113

// Configure transpose kernel

114

{

115

auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();

116

k->configure(b, &_tmp_b);

117

_mtx_b_reshape_kernel = std::move(k);

118

}

119

120

// Configure matrix multiply kernel

121

{

122

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

123

k->configure(&_tmp_a, &_tmp_b, output);

124

_mm_kernel = std::move(k);

125

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

126

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

127

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

128

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

129

// Initialize matrix B reduction kernel only if _a_offset is not equal to 0

130

if(_a_offset != 0)

131

{

Isabella Gottardi

2018-01-18 15:50:39 +0000

[diff] [blame]

132

TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

133

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

134

_vector_sum_col.allocator()->init(info_vector_sum_col);

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame^]

135

if(!_reshape_b_only_on_first_run)

136

{

137

_memory_group.manage(&_vector_sum_col);

138

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

139

140

// Configure Matrix B reduction kernel

141

_mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);

142

}

143

144

// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0

145

if(_b_offset != 0)

146

{

Isabella Gottardi

2018-01-18 15:50:39 +0000

[diff] [blame]

147

TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);

148

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

149

_vector_sum_row.allocator()->init(info_vector_sum_row);

150

_memory_group.manage(&_vector_sum_row);

151

152

// Configure matrix A reduction kernel

153

_mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);

154

}

155

156

// Configure offset contribution kernel

157

_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);

158

159

// Allocate tensors

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

160

if(!_dot_product_path && !_run_vector_matrix_multiplication)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

161

{

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

162

_tmp_a.allocator()->allocate();

163

_tmp_b.allocator()->allocate();

164

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

if(_a_offset != 0)

{

_vector_sum_col.allocator()->allocate();

}

if(_b_offset != 0)

{

_vector_sum_row.allocator()->allocate();

174

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

175

}

176

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame]

177

// Static check of a prospective configure() call.
//
// a, b      - tensor infos of the input matrices; a must be single-channel QASYMM8 and b must share its data type.
// output    - tensor info of the destination; must be single-channel S32.
// gemm_info - rejected if it reports matrix A or matrix B as already reshaped.
//
// Returns an empty Status when every check passes; otherwise the first failing check returns early
// through the ARM_COMPUTE_RETURN_* macros.
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
                                    "The output matrix must have the same number of rows as the matrix A");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
                                    "The output matrix must have the same number of columns as the matrix B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    const int32_t a_offset                         = a->quantization_info().offset;
    const int32_t b_offset                         = b->quantization_info().offset;
    const bool    run_vector_matrix_multiplication = a->dimension(1) < 2;

    if(!run_vector_matrix_multiplication)
    {
        // Use the same shape helpers as configure() so the validated temporaries
        // always match the ones that are actually created there.
        const TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type());
        const TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type());

        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
    }
    else
    {
        // Vector-by-matrix case: the multiply kernel is validated on the original operands
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
    }

    TensorInfo info_vector_sum_col, info_vector_sum_row;

    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
    }

    // Validate matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
    }

    // Validate offset contribution kernel; a reduction info is passed only when the matching offset is non-zero
    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                             a_offset, b_offset));

    return Status{};
}

248

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

249

void NEGEMMLowpMatrixMultiplyCore::run()

250

{

251

_memory_group.acquire();

252

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

253

// Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction

254

if(!_run_vector_matrix_multiplication && !_dot_product_path)

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

255

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

256

if(_mtx_a_reshape_kernel)

257

{

258

NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);

259

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

260

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame^]

261

if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

262

{

263

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

264

}

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

265

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

266

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

267

if(_asm_glue_unsigned._optimised_kernel != nullptr)

268

{

269

_asm_glue_unsigned.run();

270

}

271

else if(_asm_glue_signed._optimised_kernel != nullptr)

272

{

273

_asm_glue_signed.run();

}

else

{

NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);

278

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

279

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

280

// Run matrix A reduction kernel only if _b_offset is not equal to 0

281

if(_b_offset != 0)

282

{

283

NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);

284

}

285

286

// Run matrix B reduction kernel only if _a_offset is not equal to 0

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame^]

287

if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

288

{

289

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

290

}

291

292

// Run offset contribution kernel

293

NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);

294

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

295

_memory_group.release();

Giorgio Arena

2018-04-05 17:20:34 +0100

[diff] [blame^]

296

297

_is_first_run = false;

Pablo Tello