Blame - src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp - ml/ComputeLibrary

/** Validate tensor infos and block descriptors for the reshaped GEMM kernel.
 *
 * The checks run in the order written; the first one that fails decides the returned status.
 *
 * @param[in] src0      Reshaped LHS matrix info. Must be F16 or F32 with at most 4 dimensions.
 * @param[in] src1      Reshaped RHS matrix info. Same data type as @p src0, at most 3 dimensions.
 * @param[in] src2      Optional bias info (may be nullptr). Only inspected when @p beta is non-zero.
 * @param[in] dst       Destination info. Shape and data type are only checked when its total size is non-zero.
 * @param[in] alpha     Unused by the validation.
 * @param[in] beta      Bias weight; a zero value skips the bias shape/type checks.
 * @param[in] lhs_info  LHS block descriptor (m0, k0, transpose are read here).
 * @param[in] rhs_info  RHS block descriptor (n0, k0, transpose are read here).
 * @param[in] gemm_info GEMM descriptor (m, n, k, 3D reinterpretation flags, bias broadcast, mixed precision).
 *
 * @return An empty Status when every check passes.
 */
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
                          const GEMMRHSMatrixInfo &rhs_info,
                          const GEMMKernelInfo    &gemm_info)
{
    ARM_COMPUTE_UNUSED(alpha);

    // Pointers, data types and ranks of the two input matrices
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");

    // Block descriptors: both sides must agree on k0 and exactly one of them must be transposed
    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);

    // (x & (x - 1)) is non-zero when x is not a power of two; 3 is whitelisted explicitly.
    // NOTE(review): the expression is also zero for k0 == 0 and k0 == 1, so those values are
    // not rejected here even though the message lists only 2,3,4,8,16 - confirm this is intended.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);

    // The power-of-two-or-3 restriction on m0 / n0 only applies to the transposed operand.
    // NOTE(review): the m0 message mentions 16 although m0 > 8 is already rejected above.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");

    // A non-broadcast bias cannot be combined with 3D reinterpretation of the input or of dst
    const bool uses_3d_reinterpretation = gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0;
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(uses_3d_reinterpretation && (src2 != nullptr) && (!gemm_info.broadcast_bias),
                                    "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
    ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));

    const unsigned int gemm_m = gemm_info.m;
    const unsigned int gemm_n = gemm_info.n;
    const unsigned int gemm_k = gemm_info.k;

    // Bias checks are skipped entirely when there is no bias tensor or beta is zero
    const bool bias_is_used = (src2 != nullptr) && !(helpers::float_ops::is_zero(beta));
    if(bias_is_used)
    {
        const unsigned int bias_dim0 = src2->dimension(0);
        const unsigned int bias_dim1 = src2->dimension(1);

        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
        if(gemm_info.broadcast_bias)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias_dim1 != 1 || bias_dim0 != gemm_n), "Incorrect dimension of bias matrix which is to be broadcasted");
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias_dim0 != gemm_n || bias_dim1 != gemm_m), "Incorrect dimension of bias matrix");
        }
    }

    // Build the shapes of the matrices before reshaping (k x m for the LHS, n x k for the RHS),
    // keeping any higher dimensions of the provided tensors
    TensorShape lhs_unreshaped_shape{ src0->tensor_shape() };
    lhs_unreshaped_shape.set(0, gemm_k);
    lhs_unreshaped_shape.set(1, gemm_m);

    TensorShape rhs_unreshaped_shape{ src1->tensor_shape() };
    rhs_unreshaped_shape.set(0, gemm_n);
    rhs_unreshaped_shape.set(1, gemm_k);

    const TensorInfo lhs_unreshaped = src0->clone()->set_tensor_shape(lhs_unreshaped_shape);
    const TensorInfo rhs_unreshaped = src1->clone()->set_tensor_shape(rhs_unreshaped_shape);

    // The provided inputs must already have the shape that reshaping those matrices would produce
    const TensorInfo lhs_expected = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(lhs_unreshaped, lhs_info));
    const TensorInfo rhs_expected = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(rhs_unreshaped, rhs_info));

    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &lhs_expected);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &rhs_expected);

    // A dst with zero total size has nothing further to validate
    if(dst->total_size() == 0)
    {
        return Status{};
    }

    const TensorInfo dst_expected = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_expected);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);

    return Status{};
}

/** Compute the execution window of the kernel and report the per-iteration steps.
 *
 * @param[in]  src0                   Unused.
 * @param[in]  src1                   Unused.
 * @param[in]  src2                   Unused.
 * @param[in]  dst                    Destination info the window is derived from.
 * @param[in]  lhs_info               LHS block descriptor; m0 becomes the step along Y.
 * @param[in]  rhs_info               RHS block descriptor; n0 becomes the step along X.
 * @param[in]  gemm_info              GEMM descriptor; a non-zero depth_output_gemm3d makes the window
 *                                    be built on a collapsed copy of the dst shape.
 * @param[out] num_elements_processed Elements [0] and [1] receive the X and Y steps.
 *
 * @return A pair holding an empty Status and the collapsed window.
 */
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
                                                        const GEMMRHSMatrixInfo &rhs_info,
                                                        const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
{
    ARM_COMPUTE_UNUSED(src0, src1, src2);

    // Publish the window steps to the caller: n0 along X, m0 along Y
    unsigned int &step_x = num_elements_processed[0];
    unsigned int &step_y = num_elements_processed[1];
    step_x               = rhs_info.n0;
    step_y               = lhs_info.m0;

    // Work on a copy of the dst info so that the caller's tensor info is left untouched
    TensorInfo window_info(*dst);
    if(gemm_info.depth_output_gemm3d != 0)
    {
        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
        // the window needs to be constructed on the 2D collapsed version of the tensor
        TensorShape collapsed_shape(dst->tensor_shape());
        collapsed_shape.collapse(2U, 1U);
        window_info.set_tensor_shape(collapsed_shape);
    }

    // Configure kernel window
    Window win = calculate_max_window(window_info, Steps(step_x, step_y));

    // Collapse along the Z direction
    // This collapse needs to be here in order to tune the Z dimension of LWS
    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
    const Window       collapsed             = win.collapse(win, dimension_to_collapse);

    return std::make_pair(Status{}, collapsed);
}

} // namespace

Giorgio Arena

2021-06-28 11:00:27 +0100

[diff] [blame]

162

/** Default constructor: tags the kernel with the GEMM kernel type. */
ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel()
{
    this->_type = CLKernelType::GEMM;
}

166

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

167

void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,

Giorgio Arena

bde2f35

2021-09-07 14:15:28 +0100

[diff] [blame]

168

const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

169

const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)

170

{

171

ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);

172

Giorgio Arena

bde2f35

2021-09-07 14:15:28 +0100

[diff] [blame]

173

// dst tensor auto initialization if not yet initialized

174

auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));

175

SiCongLi

eb8bd81

2021-10-29 15:05:49 +0100

[diff] [blame]

176

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));

177

Giorgio Arena

bde2f35

2021-09-07 14:15:28 +0100

[diff] [blame]

178

auto padding_info = get_padding_info({ src0, src1, src2, dst });

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

179

_reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;

180

_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());

181

_add_bias = src2 != nullptr;

182

_export_to_cl_image = rhs_info.export_to_cl_image;

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

183

184

// Check if we need to slide the matrix B

185

const unsigned int num_dimensions_src0 = src0->num_dimensions();

186

_slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);

187

188

ElementsProcessed num_elements_processed{};

189

190

// Configure kernel window

Giorgio Arena

bde2f35

2021-09-07 14:15:28 +0100

[diff] [blame]

191

auto win_config = validate_and_configure_window(src0->clone().get(),

192

src1->clone().get(),

193

(src2 != nullptr) ? src2->clone().get() : nullptr,

dst->clone().get(),

lhs_info,

rhs_info,

gemm_info,

num_elements_processed);

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

199

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

200

ICLKernel::configure_internal(win_config.second);

201

202

const bool enable_mixed_precision = gemm_info.fp_mixed_precision;

203

const DataType data_type = src0->data_type();

204

205

// Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.

206

const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);

207

208

const unsigned int partial_store_m0 = internal_m % lhs_info.m0;

209

const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

210

_m = gemm_info.m;

211

_n = gemm_info.n;

212

_k = gemm_info.k;

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

213

214

// Create build options

215

CLBuildOptions build_opts;

216

build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));

217

build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));

218

build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");

219

build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");

220

build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));

221

build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));

222

build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");

223

build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));

224

build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");

225

build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");

226

build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE");

227

build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

228

build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");

229

build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");

230

build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));

231

build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));

232

build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

233

build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));

234

build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));

235

build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));

236

build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));

237

build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));

238

build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));

239

build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));

Jakub Sujak

0d27b2e

2023-08-24 14:01:20 +0100

[diff] [blame^]

240

build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));

241

build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));

242

build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

243

244

std::string kernel_name("gemm_mm_reshaped_");

245

kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";

246

kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";

247

kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";

248

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

249

// A macro guard to compile ONLY the kernel of interest

250

build_opts.add_option("-D" + upper_string(kernel_name));

251

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

252

// Create kernel

253

_kernel = create_kernel(compile_context, kernel_name, build_opts.options());

254

255

// Set config_id for enabling LWS tuning

256

_config_id = kernel_name;

257

_config_id += "_";

258

_config_id += (_add_bias ? "add_bias_" : "");

259

_config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");

260

_config_id += (_reinterpret_output_as_3d ? "3do_" : "");

261

_config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");

262

_config_id += lower_string(string_from_data_type(src0->data_type()));

263

_config_id += "_";

264

_config_id += (enable_mixed_precision ? "mixed_precision_" : "");

265

_config_id += support::cpp11::to_string(dst->dimension(1));

266

_config_id += "_";

267

_config_id += support::cpp11::to_string(dst->dimension(0));

268

_config_id += "_";

269

_config_id += support::cpp11::to_string(gemm_info.k);

270

_config_id += "_";

271

_config_id += support::cpp11::to_string(dst->dimension(2));

272

_config_id += "_";

273

_config_id += support::cpp11::to_string(lhs_info.m0);

274

_config_id += "_";

275

_config_id += support::cpp11::to_string(rhs_info.n0);

276

_config_id += "_";

277

_config_id += support::cpp11::to_string(lhs_info.k0);

278

_config_id += "_";

279

_config_id += support::cpp11::to_string(lhs_info.v0);

280

_config_id += "_";

281

_config_id += support::cpp11::to_string(rhs_info.h0);

282

_config_id += "_";

283

_config_id += support::cpp11::to_string(lhs_info.interleave);

284

_config_id += "_";

285

_config_id += support::cpp11::to_string(rhs_info.interleave);

286

287

ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));

288

}

289

290

/** Static validation entry point: forwards every argument to validate_arguments().
 *
 * @param[in] src0      Reshaped LHS matrix info.
 * @param[in] src1      Reshaped RHS matrix info.
 * @param[in] src2      Optional bias info (may be nullptr).
 * @param[in] dst       Destination info.
 * @param[in] alpha     Weight of the matrix product.
 * @param[in] beta      Weight of the bias.
 * @param[in] lhs_info  LHS block descriptor.
 * @param[in] rhs_info  RHS block descriptor.
 * @param[in] gemm_info GEMM descriptor.
 *
 * @return The error reported by validate_arguments(), or an empty Status if there is none.
 */
Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
                                                    const GEMMLHSMatrixInfo &lhs_info,
                                                    const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
{
    const Status arguments_status = validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info);
    ARM_COMPUTE_RETURN_ON_ERROR(arguments_status);

    return Status{};
}

void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)

299

{

300

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

301

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

302

303

const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));

304

const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));

305

const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));

306

auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));

307

308

ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);

309

ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);

310

311

if(src1->info()->num_dimensions() < 3)

312

{

313

// The stride_z for matrix B must be zero if we do not slice

314

ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);

315

}

316

317

Window slice = window.first_slice_window_3D();

318

Window slice_matrix_b = slice;

319

320

slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));

321

slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));

322

323

const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;

324

325

cl::Image2D src1_image2d;

326

327

if(_export_to_cl_image)

328

{

329

const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));

330

const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];

331

Gian Marco Iodice

3cce35d

2022-12-30 16:07:45 +0000

[diff] [blame]

332

src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);

Georgios Pinitas

856f66e

2021-04-22 21:13:21 +0100

[diff] [blame]

}

do

{