Blame - src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp - ml/ComputeLibrary

2017-12-07 16:47:52 +0000

[diff] [blame]

55

{

Georgios Pinitas

78c0090

2018-01-09 17:33:11 +0000

[diff] [blame]

56

ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);

Vidhya Sudhan Loganathan

f1f4906

2018-05-25 13:21:26 +0100

[diff] [blame]

57

ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

58

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

59

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);

Vidhya Sudhan Loganathan

a25d16c

2018-11-16 11:33:12 +0000

[diff] [blame]

60

ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

61

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

62

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

63

ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

64

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");

Gian Marco Iodice

b238f5f

2019-08-02 09:09:53 +0100

[diff] [blame]

65

ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (input2 != nullptr)

Matthew Bentham

758b5ba

2020-03-05 23:37:48 +0000

[diff] [blame]

66

&& (!reshape_info.broadcast_bias()),

67

"Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

68

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

69

if(!is_interleaved_transposed)

70

{

71

ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

72

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

73

if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

74

{

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

75

const unsigned int m = reshape_info.reinterpret_input_as_3d() ? input0->dimension(1) * input0->dimension(2) : input0->dimension(1);

76

const unsigned int n = input1->dimension(0);

77

const unsigned int input2_dim0 = input2->dimension(0);

78

const unsigned int input2_dim1 = input2->dimension(1);

79

80

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);

81

if(reshape_info.broadcast_bias())

82

{

83

ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");

}

else

{

ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");

88

}

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

89

}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

90

}

91

else

92

{

giuros01

8b6b4a9

2018-12-18 19:01:33 +0000

[diff] [blame]

93

GEMMRHSMatrixInfo rhs_info;

giuros01

2019-01-11 14:04:43 +0000

[diff] [blame]

94

GEMMLHSMatrixInfo lhs_info;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

95

const auto m = static_cast<unsigned int>(reshape_info.m());

96

const auto n = static_cast<unsigned int>(reshape_info.n());

giuros01

8b6b4a9

2018-12-18 19:01:33 +0000

[diff] [blame]

97

const int k = reshape_info.k();

98

const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();

99

const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

100

rhs_info.n0 = 16 / input1->element_size();

101

rhs_info.k0 = 1;

102

rhs_info.h0 = mult_transpose1xW_width;

103

rhs_info.interleave = false;

104

rhs_info.transpose = false;

giuros01

2019-01-11 14:04:43 +0000

[diff] [blame]

105

lhs_info.m0 = 4;

106

lhs_info.k0 = 4;

107

lhs_info.v0 = mult_interleave4x4_height;

108

lhs_info.interleave = true;

109

lhs_info.transpose = true;

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

110

111

TensorShape tensor_shape0{ input0->tensor_shape() };

112

tensor_shape0.set(0, k);

113

tensor_shape0.set(1, m);

114

115

TensorShape tensor_shape1{ input1->tensor_shape() };

116

tensor_shape1.set(0, n);

117

tensor_shape1.set(1, k);

118

119

const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);

120

const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);

121

giuros01

2019-01-11 14:04:43 +0000

[diff] [blame]

122

const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));

giuros01

8b6b4a9

2018-12-18 19:01:33 +0000

[diff] [blame]

123

const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

124

125

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);

126

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

127

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

128

if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

129

{

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

130

const unsigned int input2_dim0 = input2->dimension(0);

131

const unsigned int input2_dim1 = input2->dimension(1);

132

133

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);

134

if(reshape_info.broadcast_bias())

135

{

136

ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");

}

else

{

ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");

141

}

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

142

}

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

143

}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

144

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

145

if(output->total_size() != 0)

146

{

147

const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));

148

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);

149

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

}

return Status{};

}

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

155

inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,

156

float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

157

ElementsProcessed &num_elements_processed)

158

{

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

159

ARM_COMPUTE_UNUSED(beta);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

160

bool window_changed = false;

161

Window win{};

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

162

Window win_out{};

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

163

164

const DataType data_type = input0->data_type();

165

unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];

166

unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

167

bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();

Gian Marco Iodice

3139f03

2018-11-05 14:26:32 +0000

[diff] [blame]

168

bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

169

170

// In case both input and output have to be reinterpreted as 3D tensors,

171

// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.

172

if(reinterpret_input_as_3d == reinterpret_output_as_3d)

173

{

174

reinterpret_input_as_3d = false;

175

reinterpret_output_as_3d = false;

176

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

177

Gian Marco Iodice

750641d

2018-05-08 12:01:57 +0100

[diff] [blame]

178

// Output tensor auto inizialitation if not yet initialized

Isabella Gottardi

c4f582e

2018-10-11 19:14:55 +0100

[diff] [blame]

179

auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));

Gian Marco Iodice

750641d

2018-05-08 12:01:57 +0100

[diff] [blame]

180

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

181

TensorInfo tmp_info(*output);

182

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

183

if(reinterpret_output_as_3d)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

184

{

185

// Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,

186

// the window needs to be constructed on the 2D collapsed version of the tensor

187

TensorShape tmp_shape(output->tensor_shape());

188

tmp_shape.collapse(2U, 1U);

189

tmp_info.set_tensor_shape(tmp_shape);

190

}

191

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

192

if(is_interleaved_transposed)

193

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

194

// reinterpret_input_as_3d is not supported if is_interleaved_transposed is set

Isabella Gottardi

c4f582e

2018-10-11 19:14:55 +0100

[diff] [blame]

195

ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

196

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

197

// Configure kernel window

198

num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);

199

num_elems_processed_per_iteration_y = 4;

200

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

201

// Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor

202

// The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic

203

const int m = reshape_info.m();

204

const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;

205

206

win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

207

win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

208

Michele Di Giorgio

17a01a3

2019-01-03 15:12:27 +0000

[diff] [blame]

209

AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));

210

AccessWindowStatic input1_access(input1, 0, 0,

211

ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),

212

ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

213

AccessWindowStatic output_access(output, 0, 0,

214

ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),

215

output->dimension(1) + bottom_pad);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

216

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

217

if(input2 != nullptr)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

218

{

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

219

const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;

220

221

const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;

222

223

AccessWindowStatic input2_access(input2, 0, 0,

224

ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),

225

ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));

226

227

window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop

228

update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor

}

else

{

window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop

233

update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

234

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

235

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

236

output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

237

}

238

else // The input tensors have not been reshaped

239

{

240

// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.

241

num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);

242

num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);

243

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

244

// Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor

245

// The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

246

const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1];

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

247

const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;

248

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

249

// Create kernels according to the architecture, data type and input size.

Michalis Spyrou

a967611

2018-02-22 18:07:43 +0000

[diff] [blame]

250

GPUTarget arch_target = get_arch_from_target(gpu_target);

251

if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

252

{

Gian Marco

1d25ed5

2017-12-16 19:33:50 +0000

[diff] [blame]

253

num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

254

}

255

256

// Configure window

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

257

win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

258

win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

259

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

260

AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

261

AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));

262

AccessWindowStatic output_access(output, 0, 0,

263

ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),

264

output->dimension(1) + bottom_pad);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

265

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

266

if(input2 != nullptr)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

267

{

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

268

const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;

269

270

const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;

271

272

AccessWindowStatic input2_access(input2, 0, 0,

273

ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),

274

ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));

275

276

window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop

277

update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor

}

else

{

window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop

282

update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

283

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

284

285

Coordinates coord;

286

coord.set_num_dimensions(output->num_dimensions());

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

287

output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

288

}

289

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

290

// Collapse along the Z direction

291

// This collapse needs to be here in order to tune the Z dimension of LWS

Gian Marco Iodice

81b28c4

2018-03-29 10:29:36 +0100

[diff] [blame]

292

Window collapsed = win;

293

const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);

294

collapsed = win.collapse(win, dimension_to_collapse);

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

295

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

296

Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

297

return std::make_pair(err, collapsed);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

}

} // namespace

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

301

CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

302

: _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _add_bias(false),

303

_broadcast_bias(false)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

{

}

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

307

void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

308

bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

309

{

Manuel Bottini

4c6bd51

2020-04-08 10:15:51 +0100

[diff] [blame]

310

configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision, activation_info);

311

}

312

Manuel Bottini

679fc96

2020-04-21 16:08:53 +0100

[diff] [blame]

313

void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,

Manuel Bottini

4c6bd51

2020-04-08 10:15:51 +0100

[diff] [blame]

314

bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)

315

{

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

316

ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);

317

318

// Perform validate step

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

319

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,

320

is_interleaved_transposed, reshape_info, fp_mixed_precision));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

321

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

322

_input0 = input0;

323

_input1 = input1;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

324

_input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

325

_output = output;

326

_reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();

Gian Marco Iodice

3139f03

2018-11-05 14:26:32 +0000

[diff] [blame]

327

_reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

328

_add_bias = _input2 != nullptr;

329

_broadcast_bias = reshape_info.broadcast_bias();

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

330

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

331

// In case both input and output have to be reinterpreted as 3D tensors,

332

// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.

333

if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)

334

{

335

_reinterpret_input_as_3d = false;

336

_reinterpret_output_as_3d = false;

337

}

338

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

339

// Check if we need to slide the matrix B

340

const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();

341

342

_slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

343

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

344

const DataType data_type = input0->info()->data_type();

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

345

346

// Get target architecture

347

GPUTarget gpu_target = get_target();

348

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

349

ElementsProcessed num_elements_processed{};

350

351

// Configure kernel window

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

352

auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,

353

gpu_target, num_elements_processed);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

354

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

Anthony Barbier

b6eb353

2018-08-08 13:20:04 +0100

[diff] [blame]

355

ICLKernel::configure_internal(win_config.second);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

356

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

357

// Create build options

358

CLBuildOptions build_opts;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

359

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

360

build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));

361

build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));

362

build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");

363

build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS");

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

364

build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");

365

build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");

366

build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));

367

build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

368

build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

369

build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));

370

build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));

371

build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

372

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

373

const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;

374

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

375

std::string kernel_name;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

376

if(is_interleaved_transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

377

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

378

const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();

379

const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

380

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

381

build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

382

build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));

383

build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));

384

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

385

if(is_data_type_float(data_type) && is_bifrost)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

386

{

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

387

kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

388

}

389

else

390

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

391

kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));

Vidhya Sudhan Loganathan

38d93bd

2018-11-20 15:38:13 +0000

[diff] [blame]

392

if(fp_mixed_precision && data_type == DataType::F16)

393

{

394

// currently wider accumulator is only supported for fp16 kernels.

395

kernel_name += "_acc32";

396

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

397

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

398

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

399

else // The input tensors have not been reshaped

400

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

401

build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));

Gian Marco Iodice

2018-04-11 15:59:10 +0100

[diff] [blame]

402

build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

403

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

404

// Create kernels according to the architecture, data type and input size.

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

405

if(is_data_type_float(data_type) && is_bifrost)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

406

{

Gian Marco Iodice

2018-04-11 15:59:10 +0100

[diff] [blame]

407

kernel_name = "gemm_mm_floating_point";

408

409

if(input0->info()->num_dimensions() != 1)

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

410

{

Gian Marco Iodice

2018-04-11 15:59:10 +0100

[diff] [blame]

411

kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";

Vidhya Sudhan Loganathan

38d93bd

2018-11-20 15:38:13 +0000

[diff] [blame]

412

if(fp_mixed_precision && data_type == DataType::F16)

413

{

414

// currently wider accumulator is only supported for fp16 kernels.

415

kernel_name += "_acc32";

416

}

Gian Marco Iodice

2018-04-11 15:59:10 +0100

[diff] [blame]

417

}

418

else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)

419

{

420

// The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and

421

// FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.

422

// FC6 and FC7 of AlexNet and VGG-16).

423

kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

424

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

425

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

426

// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels

427

// via exhaustive autotuning over a range of representative layer configurations.

Anthony Barbier

b6eb353

2018-08-08 13:20:04 +0100

[diff] [blame]

428

set_lws_hint(cl::NDRange(4));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

429

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

430

else // (MIDGARD and F32) or (F16)

431

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

432

kernel_name = "gemm_mm_floating_point";

433

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

434

build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));

435

build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

436

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

437

438

// Create kernel

Manuel Bottini

4c6bd51

2020-04-08 10:15:51 +0100

[diff] [blame]

439

_kernel = create_kernel(compile_context, kernel_name, build_opts.options());

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

440

441

// Set config_id for enabling LWS tuning

442

_config_id = "gemm_";

443

_config_id += (is_interleaved_transposed ? "reshaped_" : "");

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

444

_config_id += (_add_bias ? "add_bias_" : "");

445

_config_id += (_broadcast_bias ? "broadcast_bias_" : "");

Vidhya Sudhan Loganathan

a25d16c

2018-11-16 11:33:12 +0000

[diff] [blame]

446

_config_id += (fp_mixed_precision ? "fp_mixed_" : "");

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

447

_config_id += (_reinterpret_input_as_3d ? "3di_" : "");

448

_config_id += (_reinterpret_output_as_3d ? "3do_" : "");

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

449

_config_id += lower_string(string_from_data_type(input0->info()->data_type()));

450

_config_id += "_";

451

_config_id += support::cpp11::to_string(output->info()->dimension(1));

452

_config_id += "_";

453

_config_id += support::cpp11::to_string(output->info()->dimension(0));

454

_config_id += "_";

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

455

_config_id += support::cpp11::to_string(output->info()->dimension(2));

456

_config_id += "_";

457

_config_id += support::cpp11::to_string(output->info()->dimension(3));

458

_config_id += "_";

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

459

_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

460

}

461

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

462

// Validates a prospective configuration by running validate_arguments() and then
// validate_and_configure_window() on clones of the given tensor infos.
// input2 may be nullptr (it is null-checked before being cloned below).
// alpha and activation_info are accepted but not inspected here (both are marked unused).
// Returns an empty Status{} when neither check returns early.
Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

463

bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

464

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

465

// Note: num_elements_processed will be set in validate_and_configure_window()

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

466

// Value-initialized here; it is only passed on to validate_and_configure_window() and not read afterwards in this function
ElementsProcessed num_elements_processed{};

467

ARM_COMPUTE_UNUSED(alpha);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

468

ARM_COMPUTE_UNUSED(activation_info);

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

469

// First stage: argument checks (presumably returns early with the failing Status - confirm against the macro definition)
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

470

// Second stage: window configuration check. Clones of the tensor infos are passed instead of the originals,
// presumably so that this step cannot modify the caller's infos - TODO confirm.
// Only the .first member of the returned value is checked.
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),

471

input1->clone().get(),

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

472

(input2 != nullptr) ? input2->clone().get() : nullptr,

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

473

output->clone().get(),

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

474

beta,

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

475

is_interleaved_transposed,

Gian Marco Iodice

750641d

2018-05-08 12:01:57 +0100

[diff] [blame]

476

reshape_info,

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

477

gpu_target,

478

num_elements_processed)

.first);

return Status{};

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

484

// Enqueues the kernel on the given queue once per 3D slice of the given window.
// Before the loop, the optional cross-plane padding arguments are set at fixed argument indices.
// Inside the loop, every slice rebinds the tensor arguments (input0, input1, optional bias input2, output)
// followed by one cl_uint stride argument per tensor, then enqueues with the stored LWS hint.
void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)

485

{

486

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

487

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

488

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

489

if(_input1->info()->num_dimensions() < 3)

490

{

491

// The stride_z for matrix B must be zero if we do not slice
// NOTE(review): the comment above refers to stride_z, but the check below reads strides_in_bytes()[3],
// whereas the stride passed to the kernel further down is strides_in_bytes()[2] - confirm the intended index.

492

ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);

493

}

494

495

Window slice = window.first_slice_window_3D();

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

496

// Copy of the first slice with its X and Y dimensions each replaced by Dimension(0, 1, 1);
// it is used for matrix B in the loop below when _slide_matrix_b is false
Window slice_matrix_b = slice;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

497

498

slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));

499

slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

500

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

501

// When a bias is added it contributes one 2D tensor argument plus one stride argument
// (both are set in the loop below), hence the "+ 1"
const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

502

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

503

if(_reinterpret_input_as_3d)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

504

{

Isabella Gottardi

b92805b

2018-09-28 18:24:27 +0100

[diff] [blame]

505

// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
// NOTE(review): the value actually passed below is the sum of the top and bottom padding of input0.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

506

// Argument index following the three tensors (input0, input1, output), their three stride arguments
// and the optional bias arguments. Assumes add_2D_tensor_argument() advances the index by
// num_arguments_per_2D_tensor() - TODO confirm.
const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias;

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

507

const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;

508

_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));

509

}

510

511

if(_reinterpret_output_as_3d)

512

{

513

// Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
// NOTE(review): the value actually passed below is the sum of the top and bottom padding of the output.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

514

// Same base index as for the input padding argument, shifted by one when that argument is also present
const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias;

Georgios Pinitas

e8bd2c7

2018-07-11 15:54:56 +0100

[diff] [blame]

515

const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;

516

_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

517

}

518

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

519

do

520

{

521

Window slice_b = slice;

522

// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

523

// This scenario can happen when the matrix multiplication is used to perform a convolution operation

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

524

if(!_slide_matrix_b)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

525

{

526

slice_b = slice_matrix_b;

527

}

528

529

// The argument index restarts at 0 for every slice, so all tensor and stride arguments are rebound before each enqueue
unsigned int idx = 0;

530

add_2D_tensor_argument(idx, _input0, slice);

531

add_2D_tensor_argument(idx, _input1, slice_b);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

532

if(_add_bias)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

533

{

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

534

// The bias tensor is bound with the current slice (not slice_b)
add_2D_tensor_argument(idx, _input2, slice);

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

535

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

536

add_2D_tensor_argument(idx, _output, slice);

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

537

// One stride argument (strides_in_bytes()[2]) per tensor, in the same order as the tensor arguments above
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));

538

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

539

if(_add_bias)

540

{

541

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));

542

}

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

543

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));

Anthony Barbier

b6eb353

2018-08-08 13:20:04 +0100

[diff] [blame]

544

enqueue(queue, *this, slice, lws_hint());

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

545

}

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

546

// Presumably advances slice to the next 3D slice of the window and yields false once none remain - confirm against Window
while(window.slide_window_slice_3D(slice));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

547

}

giuros01