Blame - src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2019-10-16 19:21:40 +0100

[diff] [blame^]

45

_offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),

46

_tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),

47

_fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

{

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

51

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

52

{

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

53

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);

Gian Marco Iodice

4b90865

2018-10-18 10:21:02 +0100

[diff] [blame]

54

ARM_COMPUTE_UNUSED(c);

55

ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

56

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

57

const ITensor *matrix_a = a;

58

const ITensor *matrix_b = b;

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

59

GEMMInfo info = gemm_info;

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

60

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

61

// Clear state

Anthony Barbier

71d9b57

2018-07-06 17:05:59 +0100

[diff] [blame]

62

_mtx_a_reshape_kernel = nullptr;

63

_mtx_b_reshape_kernel = nullptr;

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

64

65

// Set internal variables

Georgios Pinitas

4c5469b

2019-05-21 13:32:43 +0100

[diff] [blame]

66

_a_offset = a->info()->quantization_info().uniform().offset;

67

_b_offset = b->info()->quantization_info().uniform().offset;

Gian Marco

c7f9b89

2017-11-30 14:31:13 +0000

[diff] [blame]

68

_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

69

_reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

70

_is_prepared = false;

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

71

_fused_assembly_path = false;

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

72

_flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

73

_original_b = b;

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

74

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

75

const ITensor *a_to_use = a;

76

77

// Convert to QASYMM8 -> QASYMM8_SIGNED and back

78

if(_flip_signedness)

79

{

80

const int32_t offset_correction = 128;

81

const DataType dt = DataType::QASYMM8_SIGNED;

82

const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();

83

84

_signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));

85

_memory_group.manage(&_signed_a);

86

_convert_to_signed_asymm.configure(a_to_use, &_signed_a);

87

a_to_use = &_signed_a;

88

_a_offset = _signed_a.info()->quantization_info().uniform().offset;

89

90

const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();

91

_memory_group.manage(&_signed_output);

92

_signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));

93

94

// Output stage correction

95

GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();

96

output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;

97

output_stage_corr.gemmlowp_min_bound -= offset_correction;

98

output_stage_corr.gemmlowp_max_bound -= offset_correction;

99

info.set_gemmlowp_output_stage(output_stage_corr);

100

}

101

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

102

// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

103

if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

104

{

105

_fuse_output_stage = true;

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

106

_memory_group.manage(&_mm_result_s32);

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

107

TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

108

_mm_result_s32.allocator()->init(info_mm_result_s32);

109

}

110

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

111

#ifdef __aarch64__

112

switch(a->info()->data_type())

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

113

{

Pablo Tello

66c656a

2018-03-15 10:34:58 +0000

[diff] [blame]

114

case DataType::QASYMM8:

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

115

case DataType::QASYMM8_SIGNED:

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

116

case DataType::U8:

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

117

case DataType::S8:

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

118

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

119

if(a_to_use->info()->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

120

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

121

_asm_glue.configure(a_to_use, b, c, output, gemm_info);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

122

_fused_assembly_path = _asm_glue.is_configured();

123

}

124

else

125

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

126

_asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

127

}

128

_assembly_path = _asm_glue.is_configured();

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

break;

}

default:

{

ARM_COMPUTE_ERROR("Datatype not supported");

134

break;

135

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

136

}

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

137

#endif /* __aarch64__ */

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

138

if(!(_assembly_path || _run_vector_matrix_multiplication))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

139

{

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

matrix_a = &_tmp_a;

matrix_b = &_tmp_b;

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

144

TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

145

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

Georgios Pinitas

02acf01

2019-03-19 10:49:03 +0000

[diff] [blame]

146

TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

147

_tmp_a.allocator()->init(a_info);

148

_tmp_b.allocator()->init(b_info);

149

_memory_group.manage(&_tmp_a);

150

if(!_reshape_b_only_on_first_run)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

151

{

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

152

_memory_group.manage(&_tmp_b);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

153

}

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

154

155

// Configure interleave kernel

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

156

{

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

157

auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

158

k->configure(a_to_use, &_tmp_a);

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

159

_mtx_a_reshape_kernel = std::move(k);

160

}

Gian Marco

c7f9b89

2017-11-30 14:31:13 +0000

[diff] [blame]

161

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

162

// Configure transpose kernel

163

{

164

auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();

165

k->configure(b, &_tmp_b);

166

_mtx_b_reshape_kernel = std::move(k);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

167

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

168

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

169

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

170

if(!_fused_assembly_path)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

171

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

172

// Initialize matrix B reduction kernel only if _a_offset is not equal to 0

173

if(_a_offset != 0)

Giorgio Arena

bb54e4e

2018-04-05 17:20:34 +0100

[diff] [blame]

174

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

175

TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

176

177

_vector_sum_col.allocator()->init(info_vector_sum_col);

178

if(!_reshape_b_only_on_first_run)

179

{

180

_memory_group.manage(&_vector_sum_col);

181

}

182

183

// Configure Matrix B reduction kernel

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

184

_mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);

Giorgio Arena

bb54e4e

2018-04-05 17:20:34 +0100

[diff] [blame]

185

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

186

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

187

// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0

188

if(_b_offset != 0)

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

189

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

190

TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

191

192

_vector_sum_row.allocator()->init(info_vector_sum_row);

193

_memory_group.manage(&_vector_sum_row);

194

195

// Configure matrix A reduction kernel

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

196

_mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

197

}

198

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

199

if(_fuse_output_stage)

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

200

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

201

// Configure matrix multiply kernel

202

if(!_assembly_path)

203

{

204

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

205

k->configure(matrix_a, matrix_b, &_mm_result_s32);

206

_mm_kernel = std::move(k);

207

}

208

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

209

_offset_contribution_output_stage_kernel.configure(&_mm_result_s32,

210

_a_offset == 0 ? nullptr : &_vector_sum_col,

211

_b_offset == 0 ? nullptr : &_vector_sum_row, c,

212

_flip_signedness ? &_signed_output : output,

213

a->info()->dimension(0),

214

_a_offset, _b_offset, info.gemmlowp_output_stage());

if(_flip_signedness)

{

_convert_from_signed_asymm.configure(&_signed_output, output);

219

}

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

220

}

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

221

else

222

{

223

// Configure matrix multiply kernel

224

if(!_assembly_path)

225

{

226

auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();

227

k->configure(matrix_a, matrix_b, output);

228

_mm_kernel = std::move(k);

229

}

230

// Configure offset contribution kernel

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

231

_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

232

}

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

233

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

234

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

235

// Configure activation

236

const ActivationLayerInfo &activation = gemm_info.activation_info();

237

_run_activation = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation)));

238

if(_run_activation)

239

{

240

_activation_func.configure(output, nullptr, activation);

241

}

242

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

243

// Allocate tensors

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

244

if(!_assembly_path && !_run_vector_matrix_multiplication)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

245

{

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

246

_tmp_a.allocator()->allocate();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

247

if(!_reshape_b_only_on_first_run)

248

{

249

_tmp_b.allocator()->allocate();

250

}

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

251

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

252

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

253

if(!_fused_assembly_path)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

254

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

255

if(_a_offset != 0 && !_reshape_b_only_on_first_run)

256

{

257

_vector_sum_col.allocator()->allocate();

}

if(_b_offset != 0)

{

_vector_sum_row.allocator()->allocate();

263

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

264

}

265

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

266

if(_fuse_output_stage)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

267

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

268

_mm_result_s32.allocator()->allocate();

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

269

}

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

if(_flip_signedness)

{

_signed_a.allocator()->allocate();

274

_signed_output.allocator()->allocate();

275

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

276

}

277

Gian Marco Iodice

4b90865

2018-10-18 10:21:02 +0100

[diff] [blame]

278

/** Static check of a NEGEMMLowpMatrixMultiplyCore configuration.
 *
 * Performs the data-type and shape checks, then validates each kernel/function that
 * would be involved for the given tensor infos and GEMM meta-data.
 *
 * @param[in] a         Info of matrix A. Data type must be QASYMM8.
 * @param[in] b         Info of matrix B. Data type must be QASYMM8 or QSYMM8_PER_CHANNEL.
 * @param[in] c         Info of the optional third tensor. May be nullptr; must be nullptr when no output stage is requested.
 * @param[in] output    Info of the destination. Data type must be S32 or QASYMM8.
 * @param[in] gemm_info GEMM meta-data.
 *
 * @return An empty Status on success, otherwise the first error encountered.
 */
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    // Data types and basic GEMM constraints
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    // Working copy of the meta-data; its output stage is adjusted when the signedness of A is flipped
    GEMMInfo gemm_cfg = gemm_info;

    // Info of the tensor acting as A (the signed copy once signedness is flipped)
    const ITensorInfo *lhs_info = a;
    // Operands seen by the non-assembly matrix multiply kernel
    const ITensorInfo *mm_lhs_info = a;
    const ITensorInfo *mm_rhs_info = b;

    TensorInfo interleaved_a_info{};
    TensorInfo transposed_b_info{};
    TensorInfo s32_result_info{};

    int32_t lhs_offset = a->quantization_info().uniform().offset;
    int32_t rhs_offset = b->quantization_info().uniform().offset;

    // With an output stage the raw GEMM result goes through an S32 intermediate
    const bool has_output_stage = gemm_cfg.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(has_output_stage)
    {
        auto_init_if_empty(s32_result_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // QASYMM8 -> QASYMM8_SIGNED conversion of A (and of the output-stage destination)
    TensorInfo signed_lhs_info{};
    TensorInfo signed_dst_info{};
    const bool needs_sign_flip = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && gemm_cfg.reshape_b_only_on_first_run();
    if(needs_sign_flip)
    {
        const int32_t  sign_shift  = 128;
        const DataType signed_type = DataType::QASYMM8_SIGNED;

        const UniformQuantizationInfo lhs_qinfo = lhs_info->quantization_info().uniform();
        signed_lhs_info                         = lhs_info->clone()->set_data_type(signed_type).set_quantization_info(QuantizationInfo(lhs_qinfo.scale, lhs_qinfo.offset + sign_shift));
        ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(lhs_info, &signed_lhs_info));
        lhs_info   = &signed_lhs_info;
        lhs_offset = signed_lhs_info.quantization_info().uniform().offset;

        const UniformQuantizationInfo dst_qinfo = output->quantization_info().uniform();
        signed_dst_info                         = output->clone()->set_data_type(signed_type).set_quantization_info(QuantizationInfo(dst_qinfo.scale, dst_qinfo.offset - sign_shift));

        // Shift the output stage so that it targets the signed destination
        GEMMLowpOutputStageInfo shifted_stage = gemm_cfg.gemmlowp_output_stage();
        shifted_stage.gemmlowp_offset         = signed_dst_info.quantization_info().uniform().offset;
        shifted_stage.gemmlowp_min_bound -= sign_shift;
        shifted_stage.gemmlowp_max_bound -= sign_shift;
        gemm_cfg.set_gemmlowp_output_stage(shifted_stage);
    }

    // Find out whether the assembly dispatch accepts this configuration, and whether it also performs the requantization
    bool       run_optimised             = false;
    bool       run_optimised_requantized = false;
    const bool try_fused_assembly        = lhs_info->data_type() == DataType::QASYMM8 && gemm_cfg.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    if(try_fused_assembly)
    {
        run_optimised             = static_cast<bool>(NEGEMMAssemblyDispatch::validate(lhs_info, b, c, output, gemm_info));
        run_optimised_requantized = run_optimised;
    }
    else
    {
        const ITensorInfo *assembly_dst = has_output_stage ? &s32_result_info : output;
        run_optimised                   = static_cast<bool>(NEGEMMAssemblyDispatch::validate(lhs_info, b, nullptr, assembly_dst, gemm_info));
    }

    if(run_optimised)
    {
        // Output shape checks for the assembly path
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        if(gemm_cfg.depth_output_gemm3d() == 0)
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
        else if(gemm_cfg.reinterpret_input_as_3d())
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
        }
    }
    else
    {
        // The non-assembly path has no 3D reinterpretation support
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_cfg.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_cfg.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");

        // Reshaping is skipped for the vector-by-matrix case (A with fewer than two rows)
        if(a->dimension(1) >= 2)
        {
            mm_lhs_info = &interleaved_a_info;
            mm_rhs_info = &transposed_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape interleaved_shape = a->tensor_shape();
            interleaved_shape.set(0, a->dimension(0) * 4);
            interleaved_shape.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape transposed_shape = b->tensor_shape();
            transposed_shape.set(0, b->dimension(1) * 16);
            transposed_shape.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate the interleave and transpose kernels against the reshaped infos
            auto_init_if_empty(interleaved_a_info, lhs_info->clone()->set_tensor_shape(interleaved_shape));
            auto_init_if_empty(transposed_b_info, b->clone()->set_tensor_shape(transposed_shape));

            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(lhs_info, &interleaved_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &transposed_b_info));
        }
    }

    // Everything below is skipped when the assembly path performs the requantization itself
    if(!run_optimised_requantized)
    {
        TensorInfo col_sum_info{};
        TensorInfo row_sum_info{};

        // Matrix B reduction is only needed for a non-zero offset of A
        if(lhs_offset != 0)
        {
            col_sum_info = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &col_sum_info, a->dimension(0), false));
        }

        // Matrix A reduction is only needed for a non-zero offset of B
        if(rhs_offset != 0)
        {
            row_sum_info = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lhs_info, &row_sum_info, a->dimension(0), false));
        }

        // A reduction vector is passed on only when its offset is non-zero
        const ITensorInfo *col_sum_arg = (lhs_offset == 0) ? nullptr : &col_sum_info;
        const ITensorInfo *row_sum_arg = (rhs_offset == 0) ? nullptr : &row_sum_info;

        if(fuse_output_stage_guard(has_output_stage))
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(mm_lhs_info, mm_rhs_info, &s32_result_info));
            }

            // Validate offset contribution kernel fused with the output stage
            const ITensorInfo *stage_dst = needs_sign_flip ? &signed_dst_info : output;
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&s32_result_info,
                                                                                                col_sum_arg,
                                                                                                row_sum_arg,
                                                                                                c,
                                                                                                stage_dst,
                                                                                                lhs_offset, rhs_offset,
                                                                                                gemm_cfg.gemmlowp_output_stage()));
        }
        else
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(mm_lhs_info, mm_rhs_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     col_sum_arg,
                                                                                     row_sum_arg,
                                                                                     lhs_offset, rhs_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &act_info = gemm_info.activation_info();
    if(act_info.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
    }

    return Status{};
}

459

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

460

void NEGEMMLowpMatrixMultiplyCore::run()

461

{

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

462

prepare();

463

Georgios Pinitas

da953f2

2019-04-02 17:27:03 +0100

[diff] [blame]

464

MemoryGroupResourceScope scope_mg(_memory_group);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

465

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

466

// Reshape inputs

467

if(_mtx_a_reshape_kernel)

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

468

{

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

469

NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);

470

}

471

if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)

472

{

473

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

Pablo Tello

2017-11-02 16:09:35 +0000

[diff] [blame]

474

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

475

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

476

// Convert QASYMM8->QASYMM8_SIGNED

477

if(_flip_signedness)

478

{

479

NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);

480

}

481

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

482

// Run GEMM

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

483

if(_asm_glue.is_configured())

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

484

{

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

485

_asm_glue.run();

Pablo Tello

2018-02-23 13:43:50 +0000

[diff] [blame]

}

else

{

NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);

490

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

491

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

492

if(!_fused_assembly_path)

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

493

{

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

494

// Run matrix A reduction kernel only if _b_offset is not equal to 0

495

if(_b_offset != 0)

496

{

497

NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);

498

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

499

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

500

// Run matrix B reduction kernel only if _a_offset is not equal to 0

501

if(_a_offset != 0 && !_reshape_b_only_on_first_run)

502

{

503

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

504

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

505

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

506

if(_fuse_output_stage)

507

{

508

// Run offset contribution kernel

509

NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);

}

else

{

// Run offset contribution kernel

514

NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);

515

}

George Wort

2019-02-22 16:37:41 +0000

[diff] [blame]

516

}

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

517

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame^]

518

// Convert QASYMM8_SIGNED->QASYMM8

519

if(_flip_signedness)

520

{

521

NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);

522

}

523

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

524

// Run fused activation

525

if(_run_activation)

526

{

527

_activation_func.run();

528

}

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

529

}

Giorgio Arena

bb54e4e

2018-04-05 17:20:34 +0100

[diff] [blame]

530

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

531

void NEGEMMLowpMatrixMultiplyCore::prepare()

{

if(!_is_prepared)

{

// Run assembly reshape

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

536

if(_asm_glue.is_configured() && _reshape_b_only_on_first_run)

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

537

{

538

ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

539

Anthony Barbier

2018-07-20 17:49:35 +0100

[diff] [blame]

540

_asm_glue.prepare();

Georgios Pinitas

2018-06-05 14:56:06 +0100

[diff] [blame]

541

_original_b->mark_as_unused();

542

}

543

// Run non-assembly reshape

544

else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)

545

{

546

ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

547

548

// Run reshape kernel and mark original weights tensor as unused

549

_tmp_b.allocator()->allocate();

550

NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);

551

_original_b->mark_as_unused();

552

}

553

554

// Run matrix B reduction kernel only if _a_offset is not equal to 0

555

if(_a_offset != 0 && _reshape_b_only_on_first_run)

556

{

557

_vector_sum_col.allocator()->allocate();

558

NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);

}

_is_prepared = true;

}

Pablo Tello