Blame - src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2021-06-18 15:47:28 +0100

[diff] [blame]

1

/*

Radu Salavat

2024-02-27 18:32:26 +0000

[diff] [blame]

2

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

22

* SOFTWARE.

23

*/

Georgios Pinitas

7891a73

2021-08-20 21:39:25 +0100

[diff] [blame]

24

#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

25

26

#include "arm_compute/core/Error.h"

27

#include "arm_compute/core/Helpers.h"

28

#include "arm_compute/core/ITensor.h"

29

#include "arm_compute/core/KernelDescriptors.h"

30

#include "arm_compute/core/Types.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

31

#include "arm_compute/core/utils/misc/ShapeCalculator.h"

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

32

#include "arm_compute/core/Validate.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

33

#include "arm_compute/runtime/NEON/NEScheduler.h"

34

#include "arm_compute/runtime/TensorAllocator.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

35

ramelg01

3ae3d88

2021-09-12 23:07:47 +0100

[diff] [blame]

36

#include "src/common/utils/Log.h"

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

37

#include "src/core/helpers/AutoConfiguration.h"

38

#include "src/core/helpers/MemoryHelpers.h"

Georgios Pinitas

7891a73

2021-08-20 21:39:25 +0100

[diff] [blame]

39

#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"

40

#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"

41

#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"

42

#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"

43

#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"

44

#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"

45

#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"

46

#include "src/cpu/operators/CpuActivation.h"

47

#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

48

#include "src/cpu/utils/CpuAuxTensorHandler.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

49

50

using namespace arm_compute::misc::shape_calculator;

51

using namespace arm_compute::experimental;

52

53

namespace arm_compute

{

namespace cpu

{

namespace

{

/** Build the assembly-dispatch descriptor that corresponds to a generic GEMM descriptor.
 *
 * @param[in] info GEMM descriptor whose settings are forwarded to the assembly backend.
 *
 * @return An AsmGemmInfo whose convolution method is fixed to Im2Col and whose remaining
 *         fields mirror the matching getters of @p info.
 */
cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
    cpu::AsmGemmInfo metadata;

    // The convolution method does not depend on the incoming descriptor.
    metadata.method = cpu::AsmConvMethod::Im2Col;

    // 3D reinterpretation of input/output.
    metadata.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
    metadata.depth_output_gemm3d     = info.depth_output_gemm3d();

    // Post-processing applied to the GEMM result.
    metadata.activation_info = info.activation_info();
    metadata.output_stage    = info.gemmlowp_output_stage();

    // Execution modifiers.
    metadata.fast_mode  = info.fast_math();
    metadata.accumulate = info.accumulate();

    return metadata;
}

} // namespace

CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()

75

: _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),

76

_mm_kernel(),

77

_mtx_a_reshape_kernel(),

78

_mtx_b_reshape_kernel(),

79

_mtx_a_reduction_kernel(),

80

_mtx_b_reduction_kernel(),

81

_offset_contribution_kernel(),

82

_offset_contribution_output_stage_kernel(),

83

_activation_func(),

84

_convert_to_signed_asymm(),

85

_convert_from_signed_asymm(),

_vector_sum_col(),

_vector_sum_row(),

_tmp_a(),

_tmp_b(),

_mm_result_s32(),

_signed_a(),

_signed_output(),

_a_offset(0),

_b_offset(0),

_run_vector_matrix_multiplication(false),

96

_assembly_path(false),

97

_fused_assembly_path(false),

98

_reshape_b_only_on_first_run(false),

99

_is_prepared(false),

100

_fuse_output_stage(false),

101

_run_activation(false),

102

_flip_signedness(false),

_gemm_info(),

_aux_mem(Count)

{

}

// Defaulted destructor, defined in this translation unit rather than in the class declaration.
// NOTE(review): presumably so the smart-pointer members are destroyed where the kernel types
// are complete - confirm against the header.
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;

108

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

109

void CpuGemmLowpMatrixMultiplyCore::configure(

110

const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

111

{

112

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);

113

ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));

ramelg01

3ae3d88

2021-09-12 23:07:47 +0100

[diff] [blame]

114

ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

115

116

const ITensorInfo *matrix_a = a;

117

const ITensorInfo *matrix_b = b;

118

GEMMInfo info = gemm_info;

119

120

// Set internal variables

121

_a_offset = a->quantization_info().uniform().offset;

122

_b_offset = b->quantization_info().uniform().offset;

123

_run_vector_matrix_multiplication = a->dimension(1) < 2;

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

124

_reshape_b_only_on_first_run = b->are_values_constant();

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

125

_is_prepared = false;

126

_fused_assembly_path = false;

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

127

_flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&

128

_reshape_b_only_on_first_run;

129

_gemm_info = gemm_info;

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

130

Michael Tyler

2024-06-04 15:47:37 +0100

[diff] [blame]

131

const ITensorInfo *a_to_use = a;

132

133

// Initialize assembly kernel meta-data

134

const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);

135

136

const int32_t offset_correction = 128;

137

const DataType dt = DataType::QASYMM8_SIGNED;

138

const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();

139

140

_signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(

141

QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));

142

143

// If inputs are mixed-sign but this machine does not support mixed sign kernels,

144

// flip the sign so matched-sign kernels can be used.

145

if (!_flip_signedness && a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED &&

146

!bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, dst, asm_info)))

147

{

148

_flip_signedness = true;

149

}

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

150

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

151

_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();

152

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

153

// Convert to QASYMM8 -> QASYMM8_SIGNED and back

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

154

if (_flip_signedness)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

155

{

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

156

_convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();

157

_convert_to_signed_asymm->configure(a_to_use, &_signed_a);

158

a_to_use = &_signed_a;

159

_a_offset = _signed_a.quantization_info().uniform().offset;

160

161

const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

162

_signed_output = dst->clone()->set_data_type(dt).set_quantization_info(

163

QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

164

165

// Output stage correction

166

GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();

167

output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset;

168

output_stage_corr.gemmlowp_min_bound -= offset_correction;

169

output_stage_corr.gemmlowp_max_bound -= offset_correction;

170

info.set_gemmlowp_output_stage(output_stage_corr);

171

172

// Update matrix a

173

matrix_a = &_signed_a;

174

}

175

Michael Tyler

2024-06-04 15:47:37 +0100

[diff] [blame]

176

// Offset kernel is need if offset is non-zero or it may change (i.e. dynamic).

177

// It is not needed if the datatype is symmetric, because there is no offset

178

bool a_offset_kernel_needed = _a_offset != 0 || a->quantization_info().is_dynamic();

179

bool b_offset_kernel_needed = _b_offset != 0 || b->quantization_info().is_dynamic();

180

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

181

// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

182

if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

183

{

184

_fuse_output_stage = true;

185

_mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);

186

}

187

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

188

#ifdef __aarch64__

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

189

if (!(!b->are_values_constant() &&

190

b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

191

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

192

switch (a->data_type())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

193

{

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

194

case DataType::QASYMM8:

195

case DataType::QASYMM8_SIGNED:

196

case DataType::U8:

197

case DataType::S8:

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

198

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

199

if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&

200

info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

201

{

202

auto c_info_to_use = c == nullptr ? nullptr : c;

203

_asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);

204

_fused_assembly_path = _asm_glue->is_configured();

}

else

{

auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);

209

_asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);

210

}

211

_assembly_path = _asm_glue->is_configured();

212

break;

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

213

}

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

214

default:

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

215

{

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

216

ARM_COMPUTE_ERROR("Datatype not supported");

217

break;

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

218

}

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

219

}

220

}

221

#endif /* __aarch64__ */

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

222

if (!(_assembly_path || _run_vector_matrix_multiplication))

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

{

matrix_a = &_tmp_a;

matrix_b = &_tmp_b;

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

228

_tmp_a =

229

TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

230

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

231

_tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());

232

233

// Configure interleave kernel

234

_mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();

235

_mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);

236

237

// Configure transpose kernel

238

_mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();

239

_mtx_b_reshape_kernel->configure(b, &_tmp_b);

240

}

241

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

242

if (!_fused_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

243

{

244

// Build reduction info

245

const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

246

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

247

if (a_offset_kernel_needed)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

248

{

249

_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

250

251

// Configure Matrix B reduction kernel

252

_mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();

253

_mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);

254

}

255

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

256

if (b_offset_kernel_needed)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

257

{

258

_vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);

259

260

// Configure matrix A reduction kernel

261

_mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();

262

_mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);

263

}

264

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

265

if (_fuse_output_stage)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

266

{

267

// Configure matrix multiply kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

268

if (!_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

269

{

270

_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();

271

_mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);

272

}

273

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

274

_offset_contribution_output_stage_kernel =

275

std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();

276

_offset_contribution_output_stage_kernel->configure(

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

277

&_mm_result_s32, a_offset_kernel_needed ? &_vector_sum_col : nullptr,

278

b_offset_kernel_needed ? &_vector_sum_row : nullptr, c, _flip_signedness ? &_signed_output : dst,

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

279

a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

280

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

281

if (_flip_signedness)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

282

{

283

_convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();

284

_convert_from_signed_asymm->configure(&_signed_output, dst);

}

}

else

{

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

289

// This scale is needed for the s8_f32 kernel where the multiplication output is dequantized to F32.

290

const float dequantize_scale =

291

(dst->data_type() == DataType::F32)

292

? a->quantization_info().uniform().scale * b->quantization_info().uniform().scale

293

: 1.0f;

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

294

// Configure matrix multiply kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

295

if (!_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

296

{

297

_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();

298

_mm_kernel->configure(matrix_a, matrix_b, dst);

299

}

300

// Configure offset contribution kernel

301

_offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

302

_offset_contribution_kernel->configure(dst, a_offset_kernel_needed ? &_vector_sum_col : nullptr,

303

b_offset_kernel_needed ? &_vector_sum_row : nullptr,

304

a_to_use->dimension(0), _a_offset, _b_offset, dequantize_scale);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

305

}

306

}

307

// Configure activation

308

const ActivationLayerInfo &activation = gemm_info.activation_info();

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

309

_run_activation =

310

activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));

311

if (_run_activation)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

312

{

313

_activation_func = std::make_unique<CpuActivation>();

314

_activation_func->configure(dst, nullptr, activation);

315

}

316

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

317

if (_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

318

{

SiCong Li

c5ab4df

2023-10-17 17:38:57 +0100

[diff] [blame]

319

const auto asm_mem_req = _asm_glue->workspace();

320

for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)

321

{

322

_aux_mem[slot] = asm_mem_req[slot];

323

}

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

324

}

325

326

// Request memory for LHS and RHS reshape matrix

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

327

_aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol),

328

!_fused_assembly_path && a_offset_kernel_needed && _reshape_b_only_on_first_run

329

? MemoryLifetime::Persistent

330

: MemoryLifetime::Temporary,

331

_vector_sum_col.total_size());

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

332

_aux_mem[VectorSumRow] =

333

MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());

334

_aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());

335

_aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),

336

_reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,

337

_tmp_b.total_size());

338

_aux_mem[MMResultS32] =

339

MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());

340

_aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());

341

_aux_mem[SignedOutput] =

342

MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

343

}

344

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

345

Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,

346

const ITensorInfo *b,

347

const ITensorInfo *c,

348

const ITensorInfo *output,

349

const GEMMInfo &gemm_info)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

350

{

351

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

352

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,

353

DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

354

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

355

DataType::QASYMM8_SIGNED, DataType::F32);

356

ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && output->data_type() != DataType::F32 &&

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

357

gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,

358

"Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");

359

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

360

(a)->dimension(0) != (b)->dimension(1),

361

"The product AB is defined only if the number of columns in A is equal to the number of rows in B");

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

362

ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");

363

ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

364

Radu Salavat

2024-02-27 18:32:26 +0000

[diff] [blame]

365

// When using accumulation(in place summation), for now, the only supported DataType for output is S32.

366

if (gemm_info.accumulate())

367

{

Radu Salavat

cdce25b

2024-04-12 12:26:50 +0000

[diff] [blame]

368

#ifdef __arm__

369

ARM_COMPUTE_RETURN_ERROR_MSG("Accumulation is not supported for armv7");

370

#endif /* __arm__ */

Radu Salavat

2024-02-27 18:32:26 +0000

[diff] [blame]

371

ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE,

372

"Accumulation is not supported for output QASYMM8/QASYMM8_SIGNED");

373

}

374

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

375

GEMMInfo info = gemm_info;

376

const ITensorInfo *matrix_a_info = a;

377

const ITensorInfo *matrix_b_info = b;

378

379

const ITensorInfo *a_to_use = a;

380

381

TensorInfo tmp_a_info{};

382

TensorInfo tmp_b_info{};

383

TensorInfo mm_result_s32_info{};

384

385

int32_t a_offset = a->quantization_info().uniform().offset;

386

int32_t b_offset = b->quantization_info().uniform().offset;

387

388

bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

389

if (fuse_output_stage)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

390

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

391

auto_init_if_empty(mm_result_s32_info,

392

a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

393

}

394

Michael Tyler

2024-06-04 15:47:37 +0100

[diff] [blame]

395

// Initialize assembly kernel meta-data

396

const AsmGemmInfo asm_info = init_assembly_metadata(info);

397

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

398

// Convert QASYMM8->QASYMM8_SIGNED

Michael Tyler

2024-06-04 15:47:37 +0100

[diff] [blame]

399

const int32_t offset_correction = 128;

400

const DataType dt = DataType::QASYMM8_SIGNED;

401

const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();

402

403

TensorInfo signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(

404

QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

405

TensorInfo signed_output{};

Michael Tyler

2024-06-04 15:47:37 +0100

[diff] [blame]

406

407

bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

408

(a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();

Michael Tyler

2024-06-04 15:47:37 +0100

[diff] [blame]

409

410

// If inputs are mixed-sign but this machine does not support mixed sign kernels,

411

// flip the sign so matched-sign kernels can be used.

412

if (!flip_signedness && a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED &&

413

!bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)))

414

{

415

flip_signedness = true;

416

}

417

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

418

if (flip_signedness)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

419

{

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

420

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));

421

a_to_use = &signed_a;

422

a_offset = signed_a.quantization_info().uniform().offset;

423

424

const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

425

signed_output = output->clone()->set_data_type(dt).set_quantization_info(

426

QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

427

428

// Output stage correction

429

GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();

430

output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;

431

output_stage_corr.gemmlowp_min_bound -= offset_correction;

432

output_stage_corr.gemmlowp_max_bound -= offset_correction;

433

info.set_gemmlowp_output_stage(output_stage_corr);

434

435

// Update matrix a

436

matrix_a_info = &signed_a;

437

}

438

Michael Tyler

2024-06-04 15:47:37 +0100

[diff] [blame]

439

// Offset kernel is need if offset is non-zero or it may change (i.e. dynamic).

440

bool a_offset_kernel_needed = a_offset != 0 || a->quantization_info().is_dynamic();

441

bool b_offset_kernel_needed = b_offset != 0 || b->quantization_info().is_dynamic();

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

442

443

// Check if we need to run the optimized assembly kernel

444

bool run_optimised = false;

445

bool run_optimised_requantized = false;

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

446

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

447

if (!(!b->are_values_constant() &&

448

b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

449

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

450

if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&

451

info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

452

{

453

run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));

454

run_optimised_requantized = run_optimised;

455

}

456

else

457

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

458

run_optimised = bool(CpuGemmAssemblyDispatch::validate(

459

a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

460

}

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

461

}

462

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

463

if (run_optimised)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

464

{

465

ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

466

if (info.depth_output_gemm3d() != 0)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

467

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

468

if (info.reinterpret_input_as_3d())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

469

{

470

ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));

471

ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));

}

else

{

ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));

}

}

else

{

ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));

}

}

else

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

485

ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),

486

"NEGEMM cannot reinterpret the input tensor as 3D");

487

ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,

488

"NEGEMM cannot reinterpret the output tensor as 3D");

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

489

490

const bool run_vector_matrix_multiplication = a->dimension(1) < 2;

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

491

if (!run_vector_matrix_multiplication)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

492

{

493

matrix_a_info = &tmp_a_info;

494

matrix_b_info = &tmp_b_info;

495

496

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

497

TensorShape shape_tmp_a = a->tensor_shape();

498

shape_tmp_a.set(0, a->dimension(0) * 4);

499

shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

500

501

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

502

TensorShape shape_tmp_b = b->tensor_shape();

503

shape_tmp_b.set(0, b->dimension(1) * 16);

504

shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

505

506

// Validate interleave kernel

507

auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));

508

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

509

510

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));

511

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));

512

}

513

}

514

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

515

if (!run_optimised_requantized)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

516

{

517

TensorInfo info_vector_sum_col{};

518

TensorInfo info_vector_sum_row{};

519

520

const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

521

522

// Validate matrix B reduction kernel only if _a_offset is not equal to 0

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

523

if (a_offset_kernel_needed)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

524

{

525

info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

526

527

// Configure Matrix B reduction kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

528

ARM_COMPUTE_RETURN_ON_ERROR(

529

kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

530

}

531

532

// Validate Matrix A reduction kernel only if _b_offset is not equal to 0

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

533

if (b_offset_kernel_needed)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

534

{

535

info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

536

537

// Configure matrix A reduction kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

538

ARM_COMPUTE_RETURN_ON_ERROR(

539

kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

540

}

541

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

542

if (fuse_output_stage)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

543

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

544

if (!run_optimised)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

545

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

546

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

547

info.reinterpret_input_as_3d(),

548

"CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");

549

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

550

info.depth_output_gemm3d() != 0,

551

"CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

552

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

553

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(

554

matrix_a_info, matrix_b_info, &mm_result_s32_info));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

555

}

556

557

// Validate offset contribution kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

558

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

559

&mm_result_s32_info, a_offset_kernel_needed ? &info_vector_sum_col : nullptr,

560

b_offset_kernel_needed ? &info_vector_sum_row : nullptr, c, flip_signedness ? &signed_output : output,

561

a_offset, b_offset, info.gemmlowp_output_stage()));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

562

}

563

else

564

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

565

if (!run_optimised)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

566

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

567

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

568

info.reinterpret_input_as_3d(),

569

"CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");

570

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

571

info.depth_output_gemm3d() != 0,

572

"CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

573

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

574

ARM_COMPUTE_RETURN_ON_ERROR(

575

kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

576

}

577

// Validate offset contribution kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

578

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

579

output, a_offset_kernel_needed ? &info_vector_sum_col : nullptr,

580

b_offset_kernel_needed ? &info_vector_sum_row : nullptr, a_offset, b_offset));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

}

}

// Validate activation

585

const ActivationLayerInfo &activation = gemm_info.activation_info();

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

586

if (activation.enabled())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

587

{

588

ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));

}

return Status{};

}

// Executes the low-precision GEMM on the tensors carried by `tensors`.
//
// Reads ACL_SRC_0 / ACL_SRC_1 / ACL_SRC_2 and writes ACL_DST. The sequence is:
//   1. prepare() (one-off work),
//   2. refresh the cached offsets when the quantization info is dynamic,
//   3. optional QASYMM8 -> QASYMM8_SIGNED conversion of the first input,
//   4. the matrix multiplication, either through the assembly glue or through
//      the reshape + multiply kernels,
//   5. reductions and offset contribution (skipped on the fused assembly path),
//   6. optional QASYMM8_SIGNED -> QASYMM8 conversion and activation.
void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
{
    prepare(tensors);

    auto src_a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto src_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto src_c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
    auto dst   = tensors.get_tensor(TensorType::ACL_DST);

    // lhs_in feeds the reshape / reduction / assembly stages,
    // mm_lhs and mm_rhs feed the matrix multiply kernel.
    auto lhs_in = src_a;
    auto mm_lhs = src_a;
    auto mm_rhs = src_b;

    // Auxiliary (workspace) tensors
    CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
    CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
    CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
    CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
    CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
    CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
    CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);

    // With dynamic quantization the offsets are read from the tensors on every run
    const QuantizationInfo a_qinfo = src_a->info()->quantization_info();
    const QuantizationInfo b_qinfo = src_b->info()->quantization_info();

    if (a_qinfo.is_dynamic())
    {
        _a_offset = a_qinfo.uniform().offset;
    }
    if (b_qinfo.is_dynamic())
    {
        _b_offset = b_qinfo.uniform().offset;
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    if (_flip_signedness)
    {
        ITensorPack convert_pack = {{TensorType::ACL_SRC, src_a}, {TensorType::ACL_DST, signed_a.get()}};
        NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY,
                                       _convert_to_signed_asymm->window(), convert_pack);
        lhs_in = signed_a.get();
        mm_lhs = signed_a.get();
    }

    // Run GEMM
    if (_asm_glue->is_configured())
    {
        // Start from the caller's pack and override the entries used by the assembly path
        ITensorPack asm_pack = tensors;
        asm_pack.add_const_tensor(TensorType::ACL_SRC_0, lhs_in);
        asm_pack.add_const_tensor(TensorType::ACL_SRC_1, src_b);

        const bool fixedpoint_requantize =
            is_data_type_quantized_asymmetric(lhs_in->info()->data_type()) &&
            _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;

        if (fixedpoint_requantize)
        {
            // The third input is passed along and the result goes straight to dst
            asm_pack.add_const_tensor(TensorType::ACL_SRC_2, src_c);
            asm_pack.add_tensor(TensorType::ACL_DST, dst);
        }
        else
        {
            // Write into the S32 workspace when a separate output stage follows
            asm_pack.add_tensor(TensorType::ACL_DST, _fuse_output_stage ? mm_result_s32.get() : dst);
        }

        _asm_glue->run(asm_pack);
    }
    else
    {
        if (!_run_vector_matrix_multiplication)
        {
            mm_lhs = tmp_a.get();
            mm_rhs = tmp_b.get();

            // Run interleave kernel
            ITensorPack interleave_pack = {{TensorType::ACL_SRC, lhs_in}, {TensorType::ACL_DST, tmp_a.get()}};
            NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY,
                                           _mtx_a_reshape_kernel->window(), interleave_pack);

            if (!_reshape_b_only_on_first_run)
            {
                // Run transpose kernel
                ITensorPack transpose_pack = {{TensorType::ACL_SRC, src_b}, {TensorType::ACL_DST, tmp_b.get()}};
                NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,
                                               _mtx_b_reshape_kernel->window(), transpose_pack);
            }
        }

        ITensorPack mm_pack = {{TensorType::ACL_SRC_0, mm_lhs}, {TensorType::ACL_SRC_1, mm_rhs}};
        mm_pack.add_tensor(TensorType::ACL_DST, _fuse_output_stage ? mm_result_s32.get() : dst);
        NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), mm_pack);
    }

    if (!_fused_assembly_path)
    {
        // Run matrix A reduction kernel only if _b_offset is not equal to 0
        if (_b_offset != 0)
        {
            ITensorPack reduce_a_pack = {{TensorType::ACL_SRC, lhs_in}, {TensorType::ACL_DST, vector_sum_row.get()}};
            NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,
                                           _mtx_a_reduction_kernel->window(), reduce_a_pack);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        // (when B is reshaped only on the first run, prepare() takes care of this reduction)
        if (_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            ITensorPack reduce_b_pack = {{TensorType::ACL_SRC, src_b}, {TensorType::ACL_DST, vector_sum_col.get()}};
            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
                                           _mtx_b_reduction_kernel->window(), reduce_b_pack);
        }

        if (_fuse_output_stage)
        {
            // Push the refreshed offsets into the kernel when they are dynamic
            if (a_qinfo.is_dynamic())
            {
                _offset_contribution_output_stage_kernel->set_a_offset(_a_offset);
            }
            if (b_qinfo.is_dynamic())
            {
                _offset_contribution_output_stage_kernel->set_b_offset(_b_offset);
            }

            ITensorPack output_stage_pack;
            output_stage_pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
            output_stage_pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
            output_stage_pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
            output_stage_pack.add_tensor(TensorType::ACL_SRC_3, src_c);
            output_stage_pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);

            // Run offset contribution kernel
            NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,
                                           _offset_contribution_output_stage_kernel->window(), output_stage_pack);

            // Convert QASYMM8_SIGNED->QASYMM8
            if (_flip_signedness)
            {
                ITensorPack convert_back_pack = {{TensorType::ACL_SRC, signed_output.get()},
                                                 {TensorType::ACL_DST, dst}};
                NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,
                                               _convert_from_signed_asymm->window(), convert_back_pack);
            }
        }
        else
        {
            // Push the refreshed offsets (and the combined scale) into the kernel when they are dynamic
            if (a_qinfo.is_dynamic())
            {
                _offset_contribution_kernel->set_a_offset(_a_offset);
            }
            if (b_qinfo.is_dynamic())
            {
                _offset_contribution_kernel->set_b_offset(_b_offset);
            }
            if (a_qinfo.is_dynamic() || b_qinfo.is_dynamic())
            {
                _offset_contribution_kernel->set_scale(a_qinfo.uniform().scale * b_qinfo.uniform().scale);
            }

            ITensorPack offset_pack;
            offset_pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
            offset_pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
            offset_pack.add_tensor(TensorType::ACL_DST, dst);

            // Run offset contribution kernel
            NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,
                                           _offset_contribution_kernel->window(), offset_pack);
        }
    }

    // Run fused activation unless already run in the fused assembly
    if (_run_activation)
    {
        // In-place: dst is both source and destination
        ITensorPack activation_pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
        _activation_func->run(activation_pack);
    }
}

void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)

760

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

761

if (!_is_prepared)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

762

{

763

auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);

764

// Run assembly reshape

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

765

if (_asm_glue->is_configured())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

766

{

767

_asm_glue->prepare(tensors);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

768

}

769

// Run non-assembly reshape

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

770

else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

771

{

772

// Run reshape kernel and mark original weights tensor as unused

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

773

ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

774

CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

775

ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};

776

NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),

777

pack);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

778

}

779

780

// Run matrix B reduction kernel only if _a_offset is not equal to 0

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

781

if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

782

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

783

ITensor *vector_sum_col_p =

784

utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

785

CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

786

ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};

787

NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,

788

_mtx_b_reduction_kernel->window(), pack);

Manuel Bottini