Blame - src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp - ml/ComputeLibrary

2021-06-18 15:47:28 +0100

[diff] [blame]

1

/*

Radu Salavat

2024-02-27 18:32:26 +0000

[diff] [blame]

2

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

22

* SOFTWARE.

23

*/

Georgios Pinitas

7891a73

2021-08-20 21:39:25 +0100

[diff] [blame]

24

#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

25

26

#include "arm_compute/core/Error.h"

27

#include "arm_compute/core/Helpers.h"

28

#include "arm_compute/core/ITensor.h"

29

#include "arm_compute/core/KernelDescriptors.h"

30

#include "arm_compute/core/Types.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

31

#include "arm_compute/core/utils/misc/ShapeCalculator.h"

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

32

#include "arm_compute/core/Validate.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

33

#include "arm_compute/runtime/NEON/NEScheduler.h"

34

#include "arm_compute/runtime/TensorAllocator.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

35

ramelg01

3ae3d88

2021-09-12 23:07:47 +0100

[diff] [blame]

36

#include "src/common/utils/Log.h"

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

37

#include "src/core/helpers/AutoConfiguration.h"

38

#include "src/core/helpers/MemoryHelpers.h"

Georgios Pinitas

7891a73

2021-08-20 21:39:25 +0100

[diff] [blame]

39

#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"

40

#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"

41

#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"

42

#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"

43

#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"

44

#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"

45

#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"

46

#include "src/cpu/operators/CpuActivation.h"

47

#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

48

#include "src/cpu/utils/CpuAuxTensorHandler.h"

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

49

50

using namespace arm_compute::misc::shape_calculator;

51

using namespace arm_compute::experimental;

52

53

namespace arm_compute

{

namespace cpu

{

namespace

{

/** Build the assembly-dispatch descriptor from a generic GEMM descriptor.
 *
 * The convolution method is always set to Im2Col; every other field is copied
 * straight from the corresponding getter of @p info.
 *
 * @param[in] info GEMM settings to translate.
 *
 * @return The populated assembly GEMM descriptor.
 */
cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
    cpu::AsmGemmInfo metadata;

    // Fixed setting, not derived from info
    metadata.method = cpu::AsmConvMethod::Im2Col;

    // Settings forwarded unchanged from info
    metadata.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
    metadata.depth_output_gemm3d     = info.depth_output_gemm3d();
    metadata.activation_info         = info.activation_info();
    metadata.output_stage            = info.gemmlowp_output_stage();
    metadata.fast_mode               = info.fast_math();
    metadata.accumulate              = info.accumulate();

    return metadata;
}

} // namespace

CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()

75

: _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),

76

_mm_kernel(),

77

_mtx_a_reshape_kernel(),

78

_mtx_b_reshape_kernel(),

79

_mtx_a_reduction_kernel(),

80

_mtx_b_reduction_kernel(),

81

_offset_contribution_kernel(),

82

_offset_contribution_output_stage_kernel(),

83

_activation_func(),

84

_convert_to_signed_asymm(),

85

_convert_from_signed_asymm(),

_vector_sum_col(),

_vector_sum_row(),

_tmp_a(),

_tmp_b(),

_mm_result_s32(),

_signed_a(),

_signed_output(),

_a_offset(0),

_b_offset(0),

_run_vector_matrix_multiplication(false),

96

_assembly_path(false),

97

_fused_assembly_path(false),

98

_reshape_b_only_on_first_run(false),

99

_is_prepared(false),

100

_fuse_output_stage(false),

101

_run_activation(false),

102

_flip_signedness(false),

_gemm_info(),

_aux_mem(Count)

{

}

// Destructor: compiler-generated, but defined out of line in this translation unit.
// NOTE(review): presumably kept here so the kernel types held through std::unique_ptr members are
// complete at the point of destruction - confirm against the class header.
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;

108

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

109

void CpuGemmLowpMatrixMultiplyCore::configure(

110

const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

111

{

112

ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);

113

ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));

ramelg01

3ae3d88

2021-09-12 23:07:47 +0100

[diff] [blame]

114

ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

115

116

const ITensorInfo *matrix_a = a;

117

const ITensorInfo *matrix_b = b;

118

GEMMInfo info = gemm_info;

119

120

// Set internal variables

121

_a_offset = a->quantization_info().uniform().offset;

122

_b_offset = b->quantization_info().uniform().offset;

123

_run_vector_matrix_multiplication = a->dimension(1) < 2;

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

124

_reshape_b_only_on_first_run = b->are_values_constant();

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

125

_is_prepared = false;

126

_fused_assembly_path = false;

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

127

_flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&

128

_reshape_b_only_on_first_run;

129

_gemm_info = gemm_info;

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

130

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

131

// Offset kernel is need if offset is non-zero or it may change (i.e. dynamic).

132

// It is not needed if the datatype is symmetric, because there is no offset

133

bool a_offset_kernel_needed = _a_offset != 0 || a->quantization_info().is_dynamic();

134

bool b_offset_kernel_needed = _b_offset != 0 || b->quantization_info().is_dynamic();

135

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

136

_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();

137

138

const ITensorInfo *a_to_use = a;

139

140

// Convert to QASYMM8 -> QASYMM8_SIGNED and back

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

141

if (_flip_signedness)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

142

{

143

const int32_t offset_correction = 128;

144

const DataType dt = DataType::QASYMM8_SIGNED;

145

const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();

146

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

147

_signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(

148

QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

149

_convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();

150

_convert_to_signed_asymm->configure(a_to_use, &_signed_a);

151

a_to_use = &_signed_a;

152

_a_offset = _signed_a.quantization_info().uniform().offset;

153

154

const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

155

_signed_output = dst->clone()->set_data_type(dt).set_quantization_info(

156

QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

157

158

// Output stage correction

159

GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();

160

output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset;

161

output_stage_corr.gemmlowp_min_bound -= offset_correction;

162

output_stage_corr.gemmlowp_max_bound -= offset_correction;

163

info.set_gemmlowp_output_stage(output_stage_corr);

164

165

// Update matrix a

166

matrix_a = &_signed_a;

167

}

168

169

// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

170

if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

171

{

172

_fuse_output_stage = true;

173

_mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);

174

}

175

176

// Initialize assembly kernel meta-data

177

const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);

178

#ifdef __aarch64__

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

179

if (!(!b->are_values_constant() &&

180

b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

181

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

182

switch (a->data_type())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

183

{

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

184

case DataType::QASYMM8:

185

case DataType::QASYMM8_SIGNED:

186

case DataType::U8:

187

case DataType::S8:

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

188

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

189

if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&

190

info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

191

{

192

auto c_info_to_use = c == nullptr ? nullptr : c;

193

_asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);

194

_fused_assembly_path = _asm_glue->is_configured();

}

else

{

auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);

199

_asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);

200

}

201

_assembly_path = _asm_glue->is_configured();

202

break;

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

203

}

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

204

default:

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

205

{

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

206

ARM_COMPUTE_ERROR("Datatype not supported");

207

break;

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

208

}

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

209

}

210

}

211

#endif /* __aarch64__ */

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

212

if (!(_assembly_path || _run_vector_matrix_multiplication))

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

{

matrix_a = &_tmp_a;

matrix_b = &_tmp_b;

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

218

_tmp_a =

219

TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

220

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

221

_tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());

222

223

// Configure interleave kernel

224

_mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();

225

_mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);

226

227

// Configure transpose kernel

228

_mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();

229

_mtx_b_reshape_kernel->configure(b, &_tmp_b);

230

}

231

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

232

if (!_fused_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

233

{

234

// Build reduction info

235

const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

236

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

237

if (a_offset_kernel_needed)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

238

{

239

_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

240

241

// Configure Matrix B reduction kernel

242

_mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();

243

_mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);

244

}

245

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

246

if (b_offset_kernel_needed)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

247

{

248

_vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);

249

250

// Configure matrix A reduction kernel

251

_mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();

252

_mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);

253

}

254

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

255

if (_fuse_output_stage)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

256

{

257

// Configure matrix multiply kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

258

if (!_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

259

{

260

_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();

261

_mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);

262

}

263

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

264

_offset_contribution_output_stage_kernel =

265

std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();

266

_offset_contribution_output_stage_kernel->configure(

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

267

&_mm_result_s32, a_offset_kernel_needed ? &_vector_sum_col : nullptr,

268

b_offset_kernel_needed ? &_vector_sum_row : nullptr, c, _flip_signedness ? &_signed_output : dst,

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

269

a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

270

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

271

if (_flip_signedness)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

272

{

273

_convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();

274

_convert_from_signed_asymm->configure(&_signed_output, dst);

}

}

else

{

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

279

// This scale is needed for the s8_f32 kernel where the multiplication output is dequantized to F32.

280

const float dequantize_scale =

281

(dst->data_type() == DataType::F32)

282

? a->quantization_info().uniform().scale * b->quantization_info().uniform().scale

283

: 1.0f;

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

284

// Configure matrix multiply kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

285

if (!_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

286

{

287

_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();

288

_mm_kernel->configure(matrix_a, matrix_b, dst);

289

}

290

// Configure offset contribution kernel

291

_offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

292

_offset_contribution_kernel->configure(dst, a_offset_kernel_needed ? &_vector_sum_col : nullptr,

293

b_offset_kernel_needed ? &_vector_sum_row : nullptr,

294

a_to_use->dimension(0), _a_offset, _b_offset, dequantize_scale);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

295

}

296

}

297

// Configure activation

298

const ActivationLayerInfo &activation = gemm_info.activation_info();

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

299

_run_activation =

300

activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));

301

if (_run_activation)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

302

{

303

_activation_func = std::make_unique<CpuActivation>();

304

_activation_func->configure(dst, nullptr, activation);

305

}

306

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

307

if (_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

308

{

SiCong Li

c5ab4df

2023-10-17 17:38:57 +0100

[diff] [blame]

309

const auto asm_mem_req = _asm_glue->workspace();

310

for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)

311

{

312

_aux_mem[slot] = asm_mem_req[slot];

313

}

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

314

}

315

316

// Request memory for LHS and RHS reshape matrix

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

317

_aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol),

318

!_fused_assembly_path && a_offset_kernel_needed && _reshape_b_only_on_first_run

319

? MemoryLifetime::Persistent

320

: MemoryLifetime::Temporary,

321

_vector_sum_col.total_size());

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

322

_aux_mem[VectorSumRow] =

323

MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());

324

_aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());

325

_aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),

326

_reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,

327

_tmp_b.total_size());

328

_aux_mem[MMResultS32] =

329

MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());

330

_aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());

331

_aux_mem[SignedOutput] =

332

MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

333

}

334

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

335

Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,

336

const ITensorInfo *b,

337

const ITensorInfo *c,

338

const ITensorInfo *output,

339

const GEMMInfo &gemm_info)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

340

{

341

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

342

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,

343

DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

344

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

345

DataType::QASYMM8_SIGNED, DataType::F32);

346

ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && output->data_type() != DataType::F32 &&

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

347

gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,

348

"Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");

349

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

350

(a)->dimension(0) != (b)->dimension(1),

351

"The product AB is defined only if the number of columns in A is equal to the number of rows in B");

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

352

ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");

353

ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

354

Radu Salavat

2024-02-27 18:32:26 +0000

[diff] [blame]

355

// When using accumulation(in place summation), for now, the only supported DataType for output is S32.

356

if (gemm_info.accumulate())

357

{

Radu Salavat

cdce25b

2024-04-12 12:26:50 +0000

[diff] [blame]

358

#ifdef __arm__

359

ARM_COMPUTE_RETURN_ERROR_MSG("Accumulation is not supported for armv7");

360

#endif /* __arm__ */

Radu Salavat

2024-02-27 18:32:26 +0000

[diff] [blame]

361

ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE,

362

"Accumulation is not supported for output QASYMM8/QASYMM8_SIGNED");

363

}

364

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

365

GEMMInfo info = gemm_info;

366

const ITensorInfo *matrix_a_info = a;

367

const ITensorInfo *matrix_b_info = b;

368

369

const ITensorInfo *a_to_use = a;

370

371

TensorInfo tmp_a_info{};

372

TensorInfo tmp_b_info{};

373

TensorInfo mm_result_s32_info{};

374

375

int32_t a_offset = a->quantization_info().uniform().offset;

376

int32_t b_offset = b->quantization_info().uniform().offset;

377

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

378

// Offset kernel is need if offset is non-zero or it may change (i.e. dynamic).

379

bool a_offset_kernel_needed = a_offset != 0 || a->quantization_info().is_dynamic();

380

bool b_offset_kernel_needed = b_offset != 0 || b->quantization_info().is_dynamic();

381

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

382

bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

383

if (fuse_output_stage)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

384

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

385

auto_init_if_empty(mm_result_s32_info,

386

a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

387

}

388

389

// Convert QASYMM8->QASYMM8_SIGNED

390

TensorInfo signed_a{};

391

TensorInfo signed_output{};

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

392

bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&

393

(a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();

394

if (flip_signedness)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

395

{

396

const int32_t offset_correction = 128;

397

const DataType dt = DataType::QASYMM8_SIGNED;

398

const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();

399

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

400

signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(

401

QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

402

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));

403

a_to_use = &signed_a;

404

a_offset = signed_a.quantization_info().uniform().offset;

405

406

const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

407

signed_output = output->clone()->set_data_type(dt).set_quantization_info(

408

QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

409

410

// Output stage correction

411

GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();

412

output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;

413

output_stage_corr.gemmlowp_min_bound -= offset_correction;

414

output_stage_corr.gemmlowp_max_bound -= offset_correction;

415

info.set_gemmlowp_output_stage(output_stage_corr);

416

417

// Update matrix a

418

matrix_a_info = &signed_a;

419

}

420

421

// Initialize assembly kernel meta-data

422

const AsmGemmInfo asm_info = init_assembly_metadata(info);

423

424

// Check if we need to run the optimized assembly kernel

425

bool run_optimised = false;

426

bool run_optimised_requantized = false;

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

427

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

428

if (!(!b->are_values_constant() &&

429

b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

430

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

431

if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&

432

info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

433

{

434

run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));

435

run_optimised_requantized = run_optimised;

436

}

437

else

438

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

439

run_optimised = bool(CpuGemmAssemblyDispatch::validate(

440

a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));

Viet-Hoa Do

2023-04-03 16:27:25 +0100

[diff] [blame]

441

}

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

442

}

443

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

444

if (run_optimised)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

445

{

446

ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

447

if (info.depth_output_gemm3d() != 0)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

448

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

449

if (info.reinterpret_input_as_3d())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

450

{

451

ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));

452

ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));

}

else

{

ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));

}

}

else

{

ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));

}

}

else

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

466

ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),

467

"NEGEMM cannot reinterpret the input tensor as 3D");

468

ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,

469

"NEGEMM cannot reinterpret the output tensor as 3D");

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

470

471

const bool run_vector_matrix_multiplication = a->dimension(1) < 2;

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

472

if (!run_vector_matrix_multiplication)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

473

{

474

matrix_a_info = &tmp_a_info;

475

matrix_b_info = &tmp_b_info;

476

477

// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]

478

TensorShape shape_tmp_a = a->tensor_shape();

479

shape_tmp_a.set(0, a->dimension(0) * 4);

480

shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

481

482

// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]

483

TensorShape shape_tmp_b = b->tensor_shape();

484

shape_tmp_b.set(0, b->dimension(1) * 16);

485

shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

486

487

// Validate interleave kernel

488

auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));

489

auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

490

491

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));

492

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));

493

}

494

}

495

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

496

if (!run_optimised_requantized)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

497

{

498

TensorInfo info_vector_sum_col{};

499

TensorInfo info_vector_sum_row{};

500

501

const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

502

503

// Validate matrix B reduction kernel only if _a_offset is not equal to 0

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

504

if (a_offset_kernel_needed)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

505

{

506

info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

507

508

// Configure Matrix B reduction kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

509

ARM_COMPUTE_RETURN_ON_ERROR(

510

kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

511

}

512

513

// Validate Matrix A reduction kernel only if _b_offset is not equal to 0

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

514

if (b_offset_kernel_needed)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

515

{

516

info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

517

518

// Configure matrix A reduction kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

519

ARM_COMPUTE_RETURN_ON_ERROR(

520

kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

521

}

522

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

523

if (fuse_output_stage)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

524

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

525

if (!run_optimised)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

526

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

527

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

528

info.reinterpret_input_as_3d(),

529

"CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");

530

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

531

info.depth_output_gemm3d() != 0,

532

"CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

533

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

534

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(

535

matrix_a_info, matrix_b_info, &mm_result_s32_info));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

536

}

537

538

// Validate offset contribution kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

539

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

540

&mm_result_s32_info, a_offset_kernel_needed ? &info_vector_sum_col : nullptr,

541

b_offset_kernel_needed ? &info_vector_sum_row : nullptr, c, flip_signedness ? &signed_output : output,

542

a_offset, b_offset, info.gemmlowp_output_stage()));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

543

}

544

else

545

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

546

if (!run_optimised)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

547

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

548

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

549

info.reinterpret_input_as_3d(),

550

"CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");

551

ARM_COMPUTE_RETURN_ERROR_ON_MSG(

552

info.depth_output_gemm3d() != 0,

553

"CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

554

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

555

ARM_COMPUTE_RETURN_ON_ERROR(

556

kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

557

}

558

// Validate offset contribution kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

559

ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

560

output, a_offset_kernel_needed ? &info_vector_sum_col : nullptr,

561

b_offset_kernel_needed ? &info_vector_sum_row : nullptr, a_offset, b_offset));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

}

}

// Validate activation

566

const ActivationLayerInfo &activation = gemm_info.activation_info();

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

567

if (activation.enabled())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

568

{

569

ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));

}

return Status{};

}

// Executes the low-precision GEMM on the tensors supplied in the pack.
//
// Reads ACL_SRC_0 (a), ACL_SRC_1 (b), ACL_SRC_2 (c) and ACL_DST (dst) from `tensors`, then runs, in order:
//   1. prepare(tensors), which does its work only while !_is_prepared;
//   2. an optional conversion of `a` into signed_a when _flip_signedness is set;
//   3. the matrix multiplication, through _asm_glue when it is configured, otherwise through the
//      reshape kernels plus _mm_kernel;
//   4. unless _fused_assembly_path is set: the A/B reduction kernels and the offset contribution kernel
//      (with or without the fused output stage);
//   5. an optional conversion of signed_output back into dst;
//   6. an optional in-place activation on dst.
//
// tensors: pack holding the source/destination tensors and the auxiliary tensors used by the handlers below.
void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)

576

{

577

// prepare() guards its own body with _is_prepared, so it is called unconditionally here.

prepare(tensors);

Georgios Pinitas

22f5ed5

2021-07-23 18:58:43 +0100

[diff] [blame]

578

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

579

auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);

580

auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);

581

auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);

582

auto dst = tensors.get_tensor(TensorType::ACL_DST);

// Working aliases: redirected further down to signed_a / tmp_a / tmp_b when those stages run.

auto a_to_use = a;

auto matrix_a = a;

auto matrix_b = b;

// Handlers for the auxiliary (intermediate) tensors used by the stages below.

CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);

588

CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);

589

CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);

590

CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);

591

CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);

592

CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);

593

CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);

594

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

595

const QuantizationInfo a_qinfo = a->info()->quantization_info();

596

const QuantizationInfo b_qinfo = b->info()->quantization_info();

597

598

// With dynamic quantization the offsets are re-read from the tensors on every run.

if (a_qinfo.is_dynamic())

599

_a_offset = a_qinfo.uniform().offset;

600

if (b_qinfo.is_dynamic())

601

_b_offset = b_qinfo.uniform().offset;

602

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

603

// Convert QASYMM8->QASYMM8_SIGNED

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

604

if (_flip_signedness)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

605

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

606

ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}};

607

NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(),

608

pack);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

609

a_to_use = signed_a.get();

Georgios Pinitas

d4a5bc5

2021-08-12 07:42:51 +0100

[diff] [blame]

610

matrix_a = signed_a.get();

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

611

}

612

613

// Run GEMM

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

614

if (_asm_glue->is_configured())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

615

{

616

ITensorPack asm_glue_tensors = tensors;

617

auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);

// Asymmetric-quantized input with a fixed-point quantize-down output stage: the assembly path is also given
// c (ACL_SRC_2) and writes to dst directly. Otherwise it writes to output_to_use, i.e. mm_result_s32 when the
// output stage is fused and dst when it is not.

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

618

if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) &&

619

_gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

620

{

621

asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);

622

asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);

623

asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);

624

asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);

}

else

{

asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);

629

asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);

630

asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);

631

}

632

_asm_glue->run(asm_glue_tensors);

633

}

634

else

635

{

// Non-assembly path: unless the vector-by-matrix case applies, the multiply kernel consumes the reshaped
// copies tmp_a / tmp_b instead of the original operands.

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

636

if (!_run_vector_matrix_multiplication)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

637

{

638

matrix_a = tmp_a.get();

639

matrix_b = tmp_b.get();

640

// Run interleave kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

641

ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}};

642

NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(),

643

pack_a);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

644

// B is reshaped here only when it must be redone on every run; the first-run-only case is handled in prepare().

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

645

if (!_reshape_b_only_on_first_run)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

646

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

647

ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}};

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

648

// Run transpose kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

649

NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,

650

_mtx_b_reshape_kernel->window(), pack_b);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

651

}

652

}

// The multiply result goes to mm_result_s32 when the output stage is fused, otherwise straight to dst.

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

653

ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}};

654

if (_fuse_output_stage)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

655

{

656

pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());

}

else

{

pack_mm.add_tensor(TensorType::ACL_DST, dst);

661

}

662

NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);

663

}

664

// The reductions and the offset contribution below are skipped entirely on the fused assembly path.

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

665

if (!_fused_assembly_path)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

666

{

667

// Run matrix A reduction kernel only if _b_offset is not equal to 0

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

668

if (_b_offset != 0)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

669

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

670

ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}};

671

NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,

672

_mtx_a_reduction_kernel->window(), pack);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

673

}

674

675

// Run matrix B reduction kernel only if _a_offset is not equal to 0 and B is not handled only on the
// first run (in that case the reduction is scheduled once from prepare())

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

676

if (_a_offset != 0 && !_reshape_b_only_on_first_run)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

677

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

678

ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}};

679

NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,

680

_mtx_b_reduction_kernel->window(), pack);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

681

}

682

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

683

if (_fuse_output_stage)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

684

{

// Push the offsets refreshed at the top of run() into the kernel when quantization is dynamic.

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

685

if (a_qinfo.is_dynamic())

686

_offset_contribution_output_stage_kernel->set_a_offset(_a_offset);

687

if (b_qinfo.is_dynamic())

688

_offset_contribution_output_stage_kernel->set_b_offset(_b_offset);

689

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

690

// A reduction vector is passed as nullptr when its matching offset is zero. The kernel writes to
// signed_output when signedness was flipped (converted back to dst further down), otherwise to dst.

ITensorPack pack;

691

pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());

692

pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());

693

pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());

694

pack.add_tensor(TensorType::ACL_SRC_3, c);

695

pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);

696

697

// Run offset contribution kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

698

NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,

699

_offset_contribution_output_stage_kernel->window(), pack);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

700

}

701

else

702

{

// Same dynamic-offset refresh as the fused branch; here the kernel additionally receives the product of the
// two uniform scales whenever either input is dynamically quantized.

Jonathan Deakin

2024-01-24 09:15:38 +0000

[diff] [blame]

703

if (a_qinfo.is_dynamic())

704

_offset_contribution_kernel->set_a_offset(_a_offset);

705

if (b_qinfo.is_dynamic())

706

_offset_contribution_kernel->set_b_offset(_b_offset);

707

if (a_qinfo.is_dynamic() || b_qinfo.is_dynamic())

708

{

709

const float dequantize_scale = a_qinfo.uniform().scale * b_qinfo.uniform().scale;

710

_offset_contribution_kernel->set_scale(dequantize_scale);

711

}

712

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

713

ITensorPack pack;

714

pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());

715

pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());

716

pack.add_tensor(TensorType::ACL_DST, dst);

717

718

// Run offset contribution kernel

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

719

NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,

720

_offset_contribution_kernel->window(), pack);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

}

}

// Convert QASYMM8_SIGNED->QASYMM8

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

725

if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

726

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

727

ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}};

728

NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,

729

_convert_from_signed_asymm->window(), pack);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

730

}

731

732

// Run fused activation unless already run in the fused assembly

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

733

if (_run_activation)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

734

{

// The activation is applied in place: dst is both source and destination.

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

735

ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

736

_activation_func->run(pack);

}

}

void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)

741

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

742

if (!_is_prepared)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

743

{

744

auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);

745

// Run assembly reshape

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

746

if (_asm_glue->is_configured())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

747

{

748

_asm_glue->prepare(tensors);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

749

}

750

// Run non-assembly reshape

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

751

else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

752

{

753

// Run reshape kernel and mark original weights tensor as unused

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

754

ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

755

CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

756

ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};

757

NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),

758

pack);

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

759

}

760

761

// Run matrix B reduction kernel once here only if _a_offset is not equal to 0, B is handled only on the first run and the fused assembly path is not used

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

762

if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

763

{

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

764

ITensor *vector_sum_col_p =

765

utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));

Manuel Bottini

2021-06-18 15:47:28 +0100

[diff] [blame]

766

CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);

Felix Thomasmathibalan

2023-09-27 17:46:17 +0100

[diff] [blame]

767

ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};

768

NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,

769

_mtx_b_reduction_kernel->window(), pack);

Manuel Bottini