Blame - src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2019-10-16 19:21:40 +0100

[diff] [blame]

45

_offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),

46

_tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),

47

_fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

{

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

51

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

52

{

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

53

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

Gian Marco Iodice

4b90865

2018-10-18 10:21:02 +0100

[diff] [blame]

54

ARM_COMPUTE_UNUSED(c);

55

ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

56

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

57

const ITensor *matrix_a = a;

58

const ITensor *matrix_b = b;

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

59

GEMMInfo info = gemm_info;

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

60

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

61

// Clear state

Anthony Barbier

71d9b57

2018-07-06 17:05:59 +0100

[diff] [blame]

62

_mtx_a_reshape_kernel = nullptr;

63

_mtx_b_reshape_kernel = nullptr;

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

64

65

// Set internal variables

Georgios Pinitas

4c5469b

2019-05-21 13:32:43 +0100

[diff] [blame]

66

_a_offset = a->info()->quantization_info().uniform().offset;

67

_b_offset = b->info()->quantization_info().uniform().offset;

Gian Marco

c7f9b89

2017-11-30 14:31:13 +0000

[diff] [blame]

68

_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

69

_reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

70

_is_prepared = false;

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

71

_fused_assembly_path = false;

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

72

_flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

73

_original_b = b;

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

74

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

75

const ITensor *a_to_use = a;

76

77

// Convert to QASYMM8 -> QASYMM8_SIGNED and back

78

if(_flip_signedness)

79

{

80

const int32_t offset_correction = 128;

81

const DataType dt = DataType::QASYMM8_SIGNED;

82

const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();

83

84

_signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));

85

_memory_group.manage(&_signed_a);

86

_convert_to_signed_asymm.configure(a_to_use, &_signed_a);

87

a_to_use = &_signed_a;

88

_a_offset = _signed_a.info()->quantization_info().uniform().offset;

89

90

const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();

91

_memory_group.manage(&_signed_output);

92

_signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));

93

94

// Output stage correction

95

GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();

96

output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;

97

output_stage_corr.gemmlowp_min_bound -= offset_correction;

98

output_stage_corr.gemmlowp_max_bound -= offset_correction;

99

info.set_gemmlowp_output_stage(output_stage_corr);

Georgios Pinitas

63d4dbd

2019-11-08 11:51:56 +0000

[diff] [blame]

100

101

// Update matrix a

102

matrix_a = &_signed_a;

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

103

}

104

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

105

// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

106

if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

107

{

108

_fuse_output_stage = true;

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

109

_memory_group.manage(&_mm_result_s32);

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

110

TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

111

_mm_result_s32.allocator()->init(info_mm_result_s32);

112

}

113

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

114

#ifdef __aarch64__

115

switch(a->info()->data_type())

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

116

{

Pablo Tello

66c656a

2018-03-15 10:34:58 +0000

[diff] [blame]

117

case DataType::QASYMM8:

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

118

case DataType::QASYMM8_SIGNED:

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

119

case DataType::U8:

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

120

case DataType::S8:

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

121

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

122

if(a_to_use->info()->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

123

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

124

_asm_glue.configure(a_to_use, b, c, output, gemm_info);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

125

_fused_assembly_path = _asm_glue.is_configured();

126

}

127

else

128

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

129

_asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

130

}

131

_assembly_path = _asm_glue.is_configured();

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

break;

}

default:

{

ARM_COMPUTE_ERROR("Datatype not supported");

137

break;

138

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

139

}

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

140

#endif /* __aarch64__ */

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

141

if(!(_assembly_path || _run_vector_matrix_multiplication))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

142

{

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

matrix_a = &_tmp_a;

matrix_b = &_tmp_b;

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

147

TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

148

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

Georgios Pinitas

02acf01

2019-03-19 10:49:03 +0000

[diff] [blame]

149

TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

150

_tmp_a.allocator()->init(a_info);

151

_tmp_b.allocator()->init(b_info);

152

_memory_group.manage(&_tmp_a);

153

if(!_reshape_b_only_on_first_run)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

154

{

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

155

_memory_group.manage(&_tmp_b);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

156

}

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

157

158

// Configure interleave kernel

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

159

{

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

160

auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

161

k->configure(a_to_use, &_tmp_a);

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

162

_mtx_a_reshape_kernel = std::move(k);

163

}

Gian Marco

c7f9b89

2017-11-30 14:31:13 +0000

[diff] [blame]

164

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

165

// Configure transpose kernel

166

{

167

auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();

168

k->configure(b, &_tmp_b);

169

_mtx_b_reshape_kernel = std::move(k);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

170

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

171

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

172

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

173

if(!_fused_assembly_path)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

174

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

175

// Initialize matrix B reduction kernel only if _a_offset is not equal to 0

176

if(_a_offset != 0)

Giorgio Arena

bb54e4e

2018-04-05 17:20:34 +0100

[diff] [blame]

177

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

178

TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

179

180

_vector_sum_col.allocator()->init(info_vector_sum_col);

181

if(!_reshape_b_only_on_first_run)

182

{

183

_memory_group.manage(&_vector_sum_col);

184

}

185

186

// Configure Matrix B reduction kernel

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

187

_mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);

Giorgio Arena

bb54e4e

2018-04-05 17:20:34 +0100

[diff] [blame]

188

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

189

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

190

// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0

191

if(_b_offset != 0)

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

192

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

193

TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

194

195

_vector_sum_row.allocator()->init(info_vector_sum_row);

196

_memory_group.manage(&_vector_sum_row);

197

198

// Configure matrix A reduction kernel

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

199

_mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

200

}

201

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

202

if(_fuse_output_stage)

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

203

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

204

// Configure matrix multiply kernel

205

if(!_assembly_path)

206

{

207

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

208

k->configure(matrix_a, matrix_b, &_mm_result_s32);

209

_mm_kernel = std::move(k);

210

}

211

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

212

_offset_contribution_output_stage_kernel.configure(&_mm_result_s32,

213

_a_offset == 0 ? nullptr : &_vector_sum_col,

214

_b_offset == 0 ? nullptr : &_vector_sum_row, c,

215

_flip_signedness ? &_signed_output : output,

216

a->info()->dimension(0),

217

_a_offset, _b_offset, info.gemmlowp_output_stage());

if(_flip_signedness)

{

_convert_from_signed_asymm.configure(&_signed_output, output);

222

}

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

223

}

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

224

else

225

{

226

// Configure matrix multiply kernel

227

if(!_assembly_path)

228

{

229

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

230

k->configure(matrix_a, matrix_b, output);

231

_mm_kernel = std::move(k);

232

}

233

// Configure offset contribution kernel

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

234

_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

235

}

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

236

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

237

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

238

// Configure activation

239

const ActivationLayerInfo &activation = gemm_info.activation_info();

240

_run_activation = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation)));

241

if(_run_activation)

242

{

243

_activation_func.configure(output, nullptr, activation);

244

}

245

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

246

// Allocate tensors

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

247

if(!_assembly_path && !_run_vector_matrix_multiplication)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

248

{

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

249

_tmp_a.allocator()->allocate();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

250

if(!_reshape_b_only_on_first_run)

251

{

252

_tmp_b.allocator()->allocate();

253

}

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

254

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

255

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

256

if(!_fused_assembly_path)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

257

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

258

if(_a_offset != 0 && !_reshape_b_only_on_first_run)

259

{

260

_vector_sum_col.allocator()->allocate();

}

if(_b_offset != 0)

{

_vector_sum_row.allocator()->allocate();

266

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

267

}

268

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

269

if(_fuse_output_stage)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

270

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

271

_mm_result_s32.allocator()->allocate();

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

272

}

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

if(_flip_signedness)

{

_signed_a.allocator()->allocate();

277

_signed_output.allocator()->allocate();

278

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

279

}

280

Gian Marco Iodice

4b90865

2018-10-18 10:21:02 +0100

[diff] [blame]

281

/** Static check of whether the given tensor infos and GEMM meta-data form a supported configuration.
 *
 * The checks are performed in order and the first failing one is returned.
 *
 * @param[in] a         Info of the left-hand side matrix. Data type must be QASYMM8.
 * @param[in] b         Info of the right-hand side matrix. Data type must be QASYMM8 or QSYMM8_PER_CHANNEL.
 * @param[in] c         Info of the optional third input. Must be nullptr when the output stage type is NONE.
 * @param[in] output    Info of the destination tensor. Data type must be S32 or QASYMM8.
 * @param[in] gemm_info GEMM meta-data. Already reshaped A or B matrices are rejected.
 *
 * @return An empty Status when every check passes, otherwise the Status of the first failing check.
 */
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    // Basic data type / shape / meta-data checks
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    // Working copy of the meta-data: its output stage is corrected when the signedness of A is flipped
    GEMMInfo gemm_cfg = gemm_info;

    // Operands of the (non-assembly) matrix multiply kernel; may be redirected to the reshaped infos below
    const ITensorInfo *mm_lhs_info = a;
    const ITensorInfo *mm_rhs_info = b;

    // Info of A as seen by every kernel that follows the optional signedness conversion
    const ITensorInfo *effective_a = a;

    TensorInfo interleaved_a_info{};
    TensorInfo transposed_b_info{};
    TensorInfo acc_s32_info{};

    int32_t       a_offset = a->quantization_info().uniform().offset;
    const int32_t b_offset = b->quantization_info().uniform().offset;

    // With an output stage, the matrix product goes into an intermediate S32 tensor of the output's shape
    const bool fuse_output_stage = gemm_cfg.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(fuse_output_stage)
    {
        auto_init_if_empty(acc_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    TensorInfo a_signed_info{};
    TensorInfo out_signed_info{};
    const bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && gemm_cfg.reshape_b_only_on_first_run();
    if(flip_signedness)
    {
        const int32_t  offset_correction = 128;
        const DataType signed_dt         = DataType::QASYMM8_SIGNED;

        const UniformQuantizationInfo in_qinfo = effective_a->quantization_info().uniform();
        a_signed_info                          = effective_a->clone()->set_data_type(signed_dt).set_quantization_info(QuantizationInfo(in_qinfo.scale, in_qinfo.offset + offset_correction));
        ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(effective_a, &a_signed_info));
        effective_a = &a_signed_info;
        a_offset    = a_signed_info.quantization_info().uniform().offset;

        const UniformQuantizationInfo out_qinfo = output->quantization_info().uniform();
        out_signed_info                         = output->clone()->set_data_type(signed_dt).set_quantization_info(QuantizationInfo(out_qinfo.scale, out_qinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo corrected_stage = gemm_cfg.gemmlowp_output_stage();
        corrected_stage.gemmlowp_offset         = out_signed_info.quantization_info().uniform().offset;
        corrected_stage.gemmlowp_min_bound -= offset_correction;
        corrected_stage.gemmlowp_max_bound -= offset_correction;
        gemm_cfg.set_gemmlowp_output_stage(corrected_stage);

        // Update matrix a
        mm_lhs_info = &a_signed_info;
    }

    // Check if we need to run the optimized assembly kernel.
    // The fully fused (requantizing) assembly kernel is only attempted for QASYMM8 input with a fixed-point output stage.
    const bool try_fused_assembly = effective_a->data_type() == DataType::QASYMM8 && gemm_cfg.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    const bool run_optimised      = try_fused_assembly ?
                                    bool(NEGEMMAssemblyDispatch::validate(effective_a, b, c, output, gemm_info)) :
                                    bool(NEGEMMAssemblyDispatch::validate(effective_a, b, nullptr, fuse_output_stage ? &acc_s32_info : output, gemm_info));
    const bool run_optimised_requantized = try_fused_assembly && run_optimised;

    if(!run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_cfg.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_cfg.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");

        // A matrix with fewer than two rows is multiplied without reshaping the inputs
        const bool needs_reshape = !(a->dimension(1) < 2);
        if(needs_reshape)
        {
            mm_lhs_info = &interleaved_a_info;
            mm_rhs_info = &transposed_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape interleaved_shape = a->tensor_shape();
            interleaved_shape.set(0, a->dimension(0) * 4);
            interleaved_shape.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape transposed_shape = b->tensor_shape();
            transposed_shape.set(0, b->dimension(1) * 16);
            transposed_shape.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate the interleave and transpose kernels
            auto_init_if_empty(interleaved_a_info, effective_a->clone()->set_tensor_shape(interleaved_shape));
            auto_init_if_empty(transposed_b_info, b->clone()->set_tensor_shape(transposed_shape));

            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(effective_a, &interleaved_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &transposed_b_info));
        }
    }
    else
    {
        // Assembly path: only the output shape has to agree with the inputs
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        if(gemm_cfg.depth_output_gemm3d() == 0)
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
        else if(gemm_cfg.reinterpret_input_as_3d())
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
        }
    }

    // Everything below is skipped when the fused assembly kernel handles offsets and requantization itself
    if(!run_optimised_requantized)
    {
        TensorInfo sum_col_info{};
        TensorInfo sum_row_info{};

        // Validate matrix B reduction kernel only if a_offset is not equal to 0
        if(a_offset != 0)
        {
            sum_col_info = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &sum_col_info, a->dimension(0), false));
        }

        // Validate matrix A reduction kernel only if b_offset is not equal to 0
        if(b_offset != 0)
        {
            sum_row_info = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(effective_a, &sum_row_info, a->dimension(0), false));
        }

        const ITensorInfo *sum_col_or_null = (a_offset == 0) ? nullptr : &sum_col_info;
        const ITensorInfo *sum_row_or_null = (b_offset == 0) ? nullptr : &sum_row_info;

        // Validate the matrix multiply kernel when the assembly kernel is not used
        if(!run_optimised)
        {
            if(fuse_output_stage)
            {
                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(mm_lhs_info, mm_rhs_info, &acc_s32_info));
            }
            else
            {
                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(mm_lhs_info, mm_rhs_info, output));
            }
        }

        // Validate offset contribution kernel (fused with the output stage when one is requested)
        if(fuse_output_stage)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&acc_s32_info,
                                                                                                sum_col_or_null,
                                                                                                sum_row_or_null,
                                                                                                c,
                                                                                                flip_signedness ? &out_signed_info : output,
                                                                                                a_offset, b_offset,
                                                                                                gemm_cfg.gemmlowp_output_stage()));
        }
        else
        {
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     sum_col_or_null,
                                                                                     sum_row_or_null,
                                                                                     a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if(activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
    }

    return Status{};
}

465

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

466

void NEGEMMLowpMatrixMultiplyCore::run()

467

{

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

468

prepare();

469

Georgios Pinitas

da953f2

2019-04-02 17:27:03 +0100

[diff] [blame]

470

MemoryGroupResourceScope scope_mg(_memory_group);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

471

Georgios Pinitas

63d4dbd

2019-11-08 11:51:56 +0000

[diff] [blame]

472

// Convert QASYMM8->QASYMM8_SIGNED

473

if(_flip_signedness)

474

{

475

NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);

476

}

477

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

478

// Reshape inputs

479

if(_mtx_a_reshape_kernel)

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

480

{

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

481

NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);

482

}

483

if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)

484

{

485

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

486

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

487

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

488

// Run GEMM

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

489

if(_asm_glue.is_configured())

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

490

{

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

491

_asm_glue.run();

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

}

else

{

NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);

496

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

497

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

498

if(!_fused_assembly_path)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

499

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

500

// Run matrix A reduction kernel only if _b_offset is not equal to 0

501

if(_b_offset != 0)

502

{

503

NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);

504

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

505

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

506

// Run matrix B reduction kernel only if _a_offset is not equal to 0

507

if(_a_offset != 0 && !_reshape_b_only_on_first_run)

508

{

509

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

510

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

511

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

512

if(_fuse_output_stage)

513

{

514

// Run offset contribution kernel

515

NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);

}

else

{

// Run offset contribution kernel

520

NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);

521

}

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

522

}

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

523

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

524

// Convert QASYMM8_SIGNED->QASYMM8

525

if(_flip_signedness)

526

{

527

NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);

528

}

529

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

530

// Run fused activation

531

if(_run_activation)

532

{

533

_activation_func.run();

534

}

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

535

}

Giorgio Arena

bb54e4e

2018-04-05 17:20:34 +0100

[diff] [blame]

536

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

537

void NEGEMMLowpMatrixMultiplyCore::prepare()

{

if(!_is_prepared)

{

// Run assembly reshape

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

542

if(_asm_glue.is_configured() && _reshape_b_only_on_first_run)

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

543

{

544

ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

545

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

546

_asm_glue.prepare();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

547

_original_b->mark_as_unused();

548

}

549

// Run non-assembly reshape

550

else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)

551

{

552

ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

553

554

// Run reshape kernel and mark original weights tensor as unused

555

_tmp_b.allocator()->allocate();

556

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

557

_original_b->mark_as_unused();

558

}

559

560

// Run matrix B reduction kernel only if _a_offset is not equal to 0

561

if(_a_offset != 0 && _reshape_b_only_on_first_run)

562

{

563

_vector_sum_col.allocator()->allocate();

564

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

}

_is_prepared = true;

}

Pablo Tello