blob: 94e86c6077487fcd9bbb5c09a5021dcdf2d18026 [file] [log] [blame]
Manuel Bottinicfac51c2021-06-18 15:47:28 +01001/*
Radu Salavatf1f1f872024-02-27 18:32:26 +00002 * Copyright (c) 2021-2024 Arm Limited.
Manuel Bottinicfac51c2021-06-18 15:47:28 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010025
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
29#include "arm_compute/core/KernelDescriptors.h"
30#include "arm_compute/core/Types.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010031#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010032#include "arm_compute/core/Validate.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010033#include "arm_compute/runtime/NEON/NEScheduler.h"
34#include "arm_compute/runtime/TensorAllocator.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010035
ramelg013ae3d882021-09-12 23:07:47 +010036#include "src/common/utils/Log.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010037#include "src/core/helpers/AutoConfiguration.h"
38#include "src/core/helpers/MemoryHelpers.h"
Georgios Pinitas7891a732021-08-20 21:39:25 +010039#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
40#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
41#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
42#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
43#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
44#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
45#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
46#include "src/cpu/operators/CpuActivation.h"
47#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
48#include "src/cpu/utils/CpuAuxTensorHandler.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010049
50using namespace arm_compute::misc::shape_calculator;
51using namespace arm_compute::experimental;
52
53namespace arm_compute
54{
55namespace cpu
56{
57namespace
58{
59cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
60{
61 cpu::AsmGemmInfo asm_info;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010062 asm_info.method = cpu::AsmConvMethod::Im2Col;
63 asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
64 asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
65 asm_info.activation_info = info.activation_info();
66 asm_info.output_stage = info.gemmlowp_output_stage();
67 asm_info.fast_mode = info.fast_math();
Radu Salavatf1f1f872024-02-27 18:32:26 +000068 asm_info.accumulate = info.accumulate();
Manuel Bottinicfac51c2021-06-18 15:47:28 +010069
70 return asm_info;
71}
72} // namespace
73
// Default constructor.
// Creates the assembly dispatch object up front and value-initializes every
// kernel pointer, auxiliary TensorInfo and state flag; the real setup of
// kernels and workspace sizes happens later in configure().
CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
    : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
      _mm_kernel(),
      _mtx_a_reshape_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _activation_func(),
      _convert_to_signed_asymm(),
      _convert_from_signed_asymm(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_a(),
      _tmp_b(),
      _mm_result_s32(),
      _signed_a(),
      _signed_output(),
      _a_offset(0),
      _b_offset(0),
      _run_vector_matrix_multiplication(false),
      _assembly_path(false),
      _fused_assembly_path(false),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _fuse_output_stage(false),
      _run_activation(false),
      _flip_signedness(false),
      _gemm_info(),
      _aux_mem(Count) // one MemoryInfo slot per AuxTensorIdx entry
{
}
// Out-of-line defaulted destructor: required here because the unique_ptr members
// hold types that are incomplete in the header.
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
108
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100109void CpuGemmLowpMatrixMultiplyCore::configure(
110 const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100111{
112 ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
113 ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
ramelg013ae3d882021-09-12 23:07:47 +0100114 ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100115
116 const ITensorInfo *matrix_a = a;
117 const ITensorInfo *matrix_b = b;
118 GEMMInfo info = gemm_info;
119
120 // Set internal variables
121 _a_offset = a->quantization_info().uniform().offset;
122 _b_offset = b->quantization_info().uniform().offset;
123 _run_vector_matrix_multiplication = a->dimension(1) < 2;
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100124 _reshape_b_only_on_first_run = b->are_values_constant();
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100125 _is_prepared = false;
126 _fused_assembly_path = false;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100127 _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&
128 _reshape_b_only_on_first_run;
129 _gemm_info = gemm_info;
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100130
131 _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
132
133 const ITensorInfo *a_to_use = a;
134
135 // Convert to QASYMM8 -> QASYMM8_SIGNED and back
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100136 if (_flip_signedness)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100137 {
138 const int32_t offset_correction = 128;
139 const DataType dt = DataType::QASYMM8_SIGNED;
140 const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
141
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100142 _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
143 QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100144 _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
145 _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
146 a_to_use = &_signed_a;
147 _a_offset = _signed_a.quantization_info().uniform().offset;
148
149 const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100150 _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(
151 QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100152
153 // Output stage correction
154 GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
155 output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset;
156 output_stage_corr.gemmlowp_min_bound -= offset_correction;
157 output_stage_corr.gemmlowp_max_bound -= offset_correction;
158 info.set_gemmlowp_output_stage(output_stage_corr);
159
160 // Update matrix a
161 matrix_a = &_signed_a;
162 }
163
164 // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100165 if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100166 {
167 _fuse_output_stage = true;
168 _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
169 }
170
171 // Initialize assembly kernel meta-data
172 const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
173#ifdef __aarch64__
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100174 if (!(!b->are_values_constant() &&
175 b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100176 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100177 switch (a->data_type())
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100178 {
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100179 case DataType::QASYMM8:
180 case DataType::QASYMM8_SIGNED:
181 case DataType::U8:
182 case DataType::S8:
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100183 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100184 if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
185 info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100186 {
187 auto c_info_to_use = c == nullptr ? nullptr : c;
188 _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
189 _fused_assembly_path = _asm_glue->is_configured();
190 }
191 else
192 {
193 auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
194 _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
195 }
196 _assembly_path = _asm_glue->is_configured();
197 break;
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100198 }
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100199 default:
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100200 {
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100201 ARM_COMPUTE_ERROR("Datatype not supported");
202 break;
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100203 }
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100204 }
205 }
206#endif /* __aarch64__ */
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100207 if (!(_assembly_path || _run_vector_matrix_multiplication))
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100208 {
209 matrix_a = &_tmp_a;
210 matrix_b = &_tmp_b;
211
212 // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100213 _tmp_a =
214 TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100215 // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
216 _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
217
218 // Configure interleave kernel
219 _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
220 _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
221
222 // Configure transpose kernel
223 _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
224 _mtx_b_reshape_kernel->configure(b, &_tmp_b);
225 }
226
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100227 if (!_fused_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100228 {
229 // Build reduction info
230 const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
231
232 // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100233 if (_a_offset != 0)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100234 {
235 _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
236
237 // Configure Matrix B reduction kernel
238 _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
239 _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
240 }
241
242 // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100243 if (_b_offset != 0)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100244 {
245 _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
246
247 // Configure matrix A reduction kernel
248 _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
249 _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
250 }
251
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100252 if (_fuse_output_stage)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100253 {
254 // Configure matrix multiply kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100255 if (!_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100256 {
257 _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
258 _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
259 }
260
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100261 _offset_contribution_output_stage_kernel =
262 std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
263 _offset_contribution_output_stage_kernel->configure(
264 &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
265 _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? &_signed_output : dst,
266 a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100267
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100268 if (_flip_signedness)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100269 {
270 _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
271 _convert_from_signed_asymm->configure(&_signed_output, dst);
272 }
273 }
274 else
275 {
276 // Configure matrix multiply kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100277 if (!_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100278 {
279 _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
280 _mm_kernel->configure(matrix_a, matrix_b, dst);
281 }
282 // Configure offset contribution kernel
283 _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100284 _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col,
285 _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100286 _a_offset, _b_offset);
287 }
288 }
289 // Configure activation
290 const ActivationLayerInfo &activation = gemm_info.activation_info();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100291 _run_activation =
292 activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
293 if (_run_activation)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100294 {
295 _activation_func = std::make_unique<CpuActivation>();
296 _activation_func->configure(dst, nullptr, activation);
297 }
298
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100299 if (_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100300 {
SiCong Lic5ab4df2023-10-17 17:38:57 +0100301 const auto asm_mem_req = _asm_glue->workspace();
302 for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
303 {
304 _aux_mem[slot] = asm_mem_req[slot];
305 }
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100306 }
307
308 // Request memory for LHS and RHS reshape matrix
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100309 _aux_mem[VectorSumCol] =
310 MemoryInfo(offset_int_vec(VectorSumCol),
311 !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent
312 : MemoryLifetime::Temporary,
313 _vector_sum_col.total_size());
314 _aux_mem[VectorSumRow] =
315 MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
316 _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
317 _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),
318 _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
319 _tmp_b.total_size());
320 _aux_mem[MMResultS32] =
321 MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
322 _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
323 _aux_mem[SignedOutput] =
324 MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100325}
326
// Static validation mirroring configure(): checks data types, shapes and the
// selected execution strategy without allocating anything, and returns an error
// Status on the first violated constraint. Must stay in lock-step with
// configure() — the branch structure below deliberately replays its decisions.
Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
                                               const ITensorInfo *b,
                                               const ITensorInfo *c,
                                               const ITensorInfo *output,
                                               const GEMMInfo    &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,
                                                         DataType::QASYMM8_SIGNED);
    // A bias requires an output stage to be folded into; raw S32 output cannot take one.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr &&
                                        gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
                                    "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        (a)->dimension(0) != (b)->dimension(1),
        "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    // When using accumulation(in place summation), for now, the only supported DataType for output is S32.
    if (gemm_info.accumulate())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE,
                                        "Accumulation is not supported for output QASYMM8/QASYMM8_SIGNED");
    }

    GEMMInfo           info          = gemm_info; // local copy: output stage may be rewritten below
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    const ITensorInfo *a_to_use = a;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    // When an output stage is requested the multiply produces an S32 intermediate.
    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if (fuse_output_stage)
    {
        auto_init_if_empty(mm_result_s32_info,
                           a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    // NOTE(review): configure() gates the equivalent flag on b->are_values_constant()
    // rather than info.reshape_b_only_on_first_run() — confirm the two predicates
    // cannot diverge for any caller.
    TensorInfo signed_a{};
    TensorInfo signed_output{};
    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&
                           (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
    if (flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        // Shift the unsigned input into the signed domain (offset moves by +128).
        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
            QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
        a_to_use = &signed_a;
        a_offset = signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(
            QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction: offset and clamp bounds follow the signed intermediate.
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a_info = &signed_a;
    }

    // Initialize assembly kernel meta-data
    const AsmGemmInfo asm_info = init_assembly_metadata(info);

    // Check if we need to run the optimized assembly kernel
    bool run_optimised             = false;
    bool run_optimised_requantized = false;

    if (!(!b->are_values_constant() &&
          b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
    {
        if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
            info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Fused assembly path: requantization handled inside the assembly kernel.
            run_optimised             = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
            run_optimised_requantized = run_optimised;
        }
        else
        {
            // Unfused assembly path: assembly writes S32 (or final output when no stage).
            run_optimised = bool(CpuGemmAssemblyDispatch::validate(
                a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
        }
    }

    if (run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        // Cross-check the 2D<->3D reinterpretations against the output shape.
        if (info.depth_output_gemm3d() != 0)
        {
            if (info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        // Reference path: 3D reinterpretation is not supported here at all.
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
                                        "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
                                        "NEGEMM cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if (!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave kernel
            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    // Offset-compensation kernels are only needed when requantization is NOT fused
    // into the assembly kernel.
    if (!run_optimised_requantized)
    {
        TensorInfo info_vector_sum_col{};
        TensorInfo info_vector_sum_row{};

        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Validate matrix B reduction kernel only if _a_offset is not equal to 0
        if (a_offset != 0)
        {
            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(
                kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
        }

        // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
        if (b_offset != 0)
        {
            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

            // Configure matrix A reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(
                kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
        }

        if (fuse_output_stage)
        {
            if (!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
                    info.reinterpret_input_as_3d(),
                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
                    info.depth_output_gemm3d() != 0,
                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(
                    matrix_a_info, matrix_b_info, &mm_result_s32_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(
                &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
                b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset,
                b_offset, info.gemmlowp_output_stage()));
        }
        else
        {
            if (!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
                    info.reinterpret_input_as_3d(),
                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
                    info.depth_output_gemm3d() != 0,
                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(
                    kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(
                output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
                a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if (activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
    }

    return Status{};
}
559
560void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
561{
562 prepare(tensors);
Georgios Pinitas22f5ed52021-07-23 18:58:43 +0100563
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100564 auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
565 auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
566 auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
567 auto dst = tensors.get_tensor(TensorType::ACL_DST);
568 auto a_to_use = a;
569 auto matrix_a = a;
570 auto matrix_b = b;
571
572 CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
573 CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
574 CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
575 CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
576 CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
577 CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
578 CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
579
580 // Convert QASYMM8->QASYMM8_SIGNED
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100581 if (_flip_signedness)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100582 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100583 ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}};
584 NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(),
585 pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100586 a_to_use = signed_a.get();
Georgios Pinitasd4a5bc52021-08-12 07:42:51 +0100587 matrix_a = signed_a.get();
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100588 }
589
590 // Run GEMM
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100591 if (_asm_glue->is_configured())
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100592 {
593 ITensorPack asm_glue_tensors = tensors;
594 auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100595 if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) &&
596 _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100597 {
598 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
599 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
600 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
601 asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
602 }
603 else
604 {
605 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
606 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
607 asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
608 }
609 _asm_glue->run(asm_glue_tensors);
610 }
611 else
612 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100613 if (!_run_vector_matrix_multiplication)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100614 {
615 matrix_a = tmp_a.get();
616 matrix_b = tmp_b.get();
617 // Run interleave kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100618 ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}};
619 NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(),
620 pack_a);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100621
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100622 if (!_reshape_b_only_on_first_run)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100623 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100624 ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}};
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100625 // Run transpose kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100626 NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,
627 _mtx_b_reshape_kernel->window(), pack_b);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100628 }
629 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100630 ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}};
631 if (_fuse_output_stage)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100632 {
633 pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
634 }
635 else
636 {
637 pack_mm.add_tensor(TensorType::ACL_DST, dst);
638 }
639 NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
640 }
641
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100642 if (!_fused_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100643 {
644 // Run matrix A reduction kernel only if _b_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100645 if (_b_offset != 0)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100646 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100647 ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}};
648 NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,
649 _mtx_a_reduction_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100650 }
651
652 // Run matrix B reduction kernel only if _a_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100653 if (_a_offset != 0 && !_reshape_b_only_on_first_run)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100654 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100655 ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}};
656 NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
657 _mtx_b_reduction_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100658 }
659
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100660 if (_fuse_output_stage)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100661 {
662 ITensorPack pack;
663 pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
664 pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
665 pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
666 pack.add_tensor(TensorType::ACL_SRC_3, c);
667 pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
668
669 // Run offset contribution kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100670 NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,
671 _offset_contribution_output_stage_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100672 }
673 else
674 {
675 ITensorPack pack;
676 pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
677 pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
678 pack.add_tensor(TensorType::ACL_DST, dst);
679
680 // Run offset contribution kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100681 NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,
682 _offset_contribution_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100683 }
684 }
685
686 // Convert QASYMM8_SIGNED->QASYMM8
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100687 if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100688 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100689 ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}};
690 NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,
691 _convert_from_signed_asymm->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100692 }
693
694 // Run fused activation unless already run in the fused assembly
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100695 if (_run_activation)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100696 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100697 ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100698 _activation_func->run(pack);
699 }
700}
701
702void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
703{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100704 if (!_is_prepared)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100705 {
706 auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
707 // Run assembly reshape
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100708 if (_asm_glue->is_configured())
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100709 {
710 _asm_glue->prepare(tensors);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100711 }
712 // Run non-assembly reshape
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100713 else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100714 {
715 // Run reshape kernel and mark original weights tensor as unused
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100716 ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100717 CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100718 ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};
719 NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),
720 pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100721 }
722
723 // Run matrix B reduction kernel only if _a_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100724 if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100725 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100726 ITensor *vector_sum_col_p =
727 utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100728 CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100729 ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};
730 NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
731 _mtx_b_reduction_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100732 }
733 _is_prepared = true;
734 }
735}
// Expose the auxiliary (workspace) memory requirements computed at configure
// time — e.g. reshaped A/B buffers, reduction vectors and intermediate
// s32 results — so the caller can allocate them and pass them via the
// tensor pack at run()/prepare() time.
experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
{
    return _aux_mem;
}
740} // namespace cpu
741} // namespace arm_compute