Blame - src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp - ml/ComputeLibrary

2017-12-07 16:47:52 +0000

[diff] [blame]

54

{

Georgios Pinitas

78c0090

2018-01-09 17:33:11 +0000

[diff] [blame]

55

ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);

Vidhya Sudhan Loganathan

f1f4906

2018-05-25 13:21:26 +0100

[diff] [blame]

56

ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

57

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

58

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);

Vidhya Sudhan Loganathan

a25d16c

2018-11-16 11:33:12 +0000

[diff] [blame]

59

ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

60

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

61

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

62

ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

63

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

64

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

65

const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;

66

const bool has_vec_c = input2 != nullptr && beta != 0.f;

67

ARM_COMPUTE_RETURN_ERROR_ON_MSG(has_vec_c && !is_beta_one, "Adding input2 is only supported for beta equal to 1");

68

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

69

if(!is_interleaved_transposed)

70

{

71

ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

if(has_vec_c)

{

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2);

76

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor");

77

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->dimension(0) != input1->dimension(0), "Length of Vector C must match the number of columns of matrix B");

78

}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

79

}

80

else

81

{

giuros01

8b6b4a9

2018-12-18 19:01:33 +0000

[diff] [blame]

82

GEMMRHSMatrixInfo rhs_info;

giuros01

2019-01-11 14:04:43 +0000

[diff] [blame]

83

GEMMLHSMatrixInfo lhs_info;

giuros01

8b6b4a9

2018-12-18 19:01:33 +0000

[diff] [blame]

84

const int m = reshape_info.m();

85

const int n = reshape_info.n();

86

const int k = reshape_info.k();

87

const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();

88

const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

89

rhs_info.n0 = 16 / input1->element_size();

90

rhs_info.k0 = 1;

91

rhs_info.h0 = mult_transpose1xW_width;

92

rhs_info.interleave = false;

93

rhs_info.transpose = false;

giuros01

2019-01-11 14:04:43 +0000

[diff] [blame]

94

lhs_info.m0 = 4;

95

lhs_info.k0 = 4;

96

lhs_info.v0 = mult_interleave4x4_height;

97

lhs_info.interleave = true;

98

lhs_info.transpose = true;

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

99

100

TensorShape tensor_shape0{ input0->tensor_shape() };

101

tensor_shape0.set(0, k);

102

tensor_shape0.set(1, m);

103

104

TensorShape tensor_shape1{ input1->tensor_shape() };

105

tensor_shape1.set(0, n);

106

tensor_shape1.set(1, k);

107

108

const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);

109

const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);

110

giuros01

2019-01-11 14:04:43 +0000

[diff] [blame]

111

const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));

giuros01

8b6b4a9

2018-12-18 19:01:33 +0000

[diff] [blame]

112

const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

113

114

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);

115

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

if(has_vec_c)

{

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2);

120

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor");

121

}

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

122

}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

123

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

124

if(output->total_size() != 0)

125

{

126

const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));

127

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);

128

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

}

return Status{};

}

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

134

inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,

135

float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

136

ElementsProcessed &num_elements_processed)

137

{

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

138

ARM_COMPUTE_UNUSED(beta);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

139

bool window_changed = false;

140

Window win{};

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

141

Window win_out{};

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

142

143

const DataType data_type = input0->data_type();

144

unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];

145

unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

146

bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();

Gian Marco Iodice

3139f03

2018-11-05 14:26:32 +0000

[diff] [blame]

147

bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

148

const bool has_vec_c = input2 != nullptr && beta != 0.f;

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

149

150

// In case both input and output have to be reinterpreted as 3D tensors,

151

// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.

152

if(reinterpret_input_as_3d == reinterpret_output_as_3d)

153

{

154

reinterpret_input_as_3d = false;

155

reinterpret_output_as_3d = false;

156

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

157

Gian Marco Iodice

750641d

2018-05-08 12:01:57 +0100

[diff] [blame]

158

// Output tensor auto inizialitation if not yet initialized

Isabella Gottardi

c4f582e

2018-10-11 19:14:55 +0100

[diff] [blame]

159

auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));

Gian Marco Iodice

750641d

2018-05-08 12:01:57 +0100

[diff] [blame]

160

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

161

TensorInfo tmp_info(*output);

162

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

163

if(reinterpret_output_as_3d)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

164

{

165

// Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,

166

// the window needs to be constructed on the 2D collapsed version of the tensor

167

TensorShape tmp_shape(output->tensor_shape());

168

tmp_shape.collapse(2U, 1U);

169

tmp_info.set_tensor_shape(tmp_shape);

170

}

171

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

172

if(is_interleaved_transposed)

173

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

174

// reinterpret_input_as_3d is not supported if is_interleaved_transposed is set

Isabella Gottardi

c4f582e

2018-10-11 19:14:55 +0100

[diff] [blame]

175

ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

176

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

177

// Configure kernel window

178

num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);

179

num_elems_processed_per_iteration_y = 4;

180

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

181

// Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor

182

// The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic

183

const int m = reshape_info.m();

184

const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;

185

186

win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

187

win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

188

Michele Di Giorgio

17a01a3

2019-01-03 15:12:27 +0000

[diff] [blame]

189

AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));

190

AccessWindowStatic input1_access(input1, 0, 0,

191

ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),

192

ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

193

AccessWindowStatic output_access(output, 0, 0,

194

ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),

195

output->dimension(1) + bottom_pad);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

196

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

197

window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop

198

update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

199

if(has_vec_c)

200

{

201

AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x);

202

window_changed = window_changed || update_window_and_padding(win, input2_access);

203

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

204

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

205

output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

206

}

207

else // The input tensors have not been reshaped

208

{

209

// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.

210

num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);

211

num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);

212

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

213

// Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor

214

// The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

215

const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1];

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

216

const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;

217

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

218

// Create kernels according to the architecture, data type and input size.

Michalis Spyrou

a967611

2018-02-22 18:07:43 +0000

[diff] [blame]

219

GPUTarget arch_target = get_arch_from_target(gpu_target);

220

if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

221

{

Gian Marco

1d25ed5

2017-12-16 19:33:50 +0000

[diff] [blame]

222

num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

223

}

224

225

// Configure window

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

226

win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

227

win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

228

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

229

AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

230

AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));

231

AccessWindowStatic output_access(output, 0, 0,

232

ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),

233

output->dimension(1) + bottom_pad);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

234

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

235

window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop

236

update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

237

if(has_vec_c)

238

{

239

AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x);

240

window_changed = window_changed || update_window_and_padding(win, input2_access);

241

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

242

243

Coordinates coord;

244

coord.set_num_dimensions(output->num_dimensions());

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

245

output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

246

}

247

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

248

// Collapse along the Z direction

249

// This collapse needs to be here in order to tune the Z dimension of LWS

Gian Marco Iodice

81b28c4

2018-03-29 10:29:36 +0100

[diff] [blame]

250

Window collapsed = win;

251

const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);

252

collapsed = win.collapse(win, dimension_to_collapse);

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

253

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

254

Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

255

return std::make_pair(err, collapsed);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

}

} // namespace

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

259

// Default constructor: no tensors are bound yet; matrix B is slid by default and
// neither 3D reinterpretation nor the vector C addition is enabled until configure() is called.
CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
    : _input0(nullptr),
      _input1(nullptr),
      _input2(nullptr),
      _output(nullptr),
      _slide_matrix_b(true),
      _reinterpret_input_as_3d(false),
      _reinterpret_output_as_3d(false),
      _has_vec_c(false)
{
}

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

264

void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,

265

bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

266

{

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

267

ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);

268

269

// Perform validate step

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

270

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,

271

is_interleaved_transposed, reshape_info, fp_mixed_precision));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

272

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

273

_input0 = input0;

274

_input1 = input1;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

275

_input2 = input2;

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

276

_output = output;

277

_reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();

Gian Marco Iodice

3139f03

2018-11-05 14:26:32 +0000

[diff] [blame]

278

_reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

279

Gian Marco Iodice

2018-08-06 14:31:15 +0100

[diff] [blame]

280

// In case both input and output have to be reinterpreted as 3D tensors,

281

// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.

282

if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)

283

{

284

_reinterpret_input_as_3d = false;

285

_reinterpret_output_as_3d = false;

286

}

287

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

288

// Check if we need to slide the matrix B

289

const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();

290

291

_slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

292

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

293

const DataType data_type = input0->info()->data_type();

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

294

295

// Get target architecture

296

GPUTarget gpu_target = get_target();

297

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

298

ElementsProcessed num_elements_processed{};

299

300

// Configure kernel window

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

301

auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,

302

gpu_target, num_elements_processed);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

303

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

Anthony Barbier

b6eb353

2018-08-08 13:20:04 +0100

[diff] [blame]

304

ICLKernel::configure_internal(win_config.second);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

305

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

306

// Create build options

307

CLBuildOptions build_opts;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

308

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

309

// Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.

Gian Marco Iodice

82d9dd1

2019-06-10 16:45:40 +0100

[diff] [blame]

310

if(!(helpers::float_ops::is_one(alpha)))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

311

{

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

312

build_opts.add_option("-DALPHA=" + float_to_string_with_full_precision(alpha));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

313

}

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

314

build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");

315

build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");

316

build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));

317

build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

318

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

319

// Do not slide matrix B if _slide_matrix_b = false

320

build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));

321

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

322

const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;

323

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

324

_has_vec_c = input2 != nullptr && beta != 0.f;

325

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

326

std::string kernel_name;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

327

if(is_interleaved_transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

328

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

329

const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();

330

const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

331

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

332

build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

333

build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));

334

build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));

335

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

336

if(is_data_type_float(data_type) && is_bifrost)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

337

{

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

338

kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

339

}

340

else

341

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

342

kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));

Vidhya Sudhan Loganathan

38d93bd

2018-11-20 15:38:13 +0000

[diff] [blame]

343

if(fp_mixed_precision && data_type == DataType::F16)

344

{

345

// currently wider accumulator is only supported for fp16 kernels.

346

kernel_name += "_acc32";

347

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

348

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

349

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

350

else // The input tensors have not been reshaped

351

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

352

build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));

Gian Marco Iodice

2018-04-11 15:59:10 +0100

[diff] [blame]

353

build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

354

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

355

// Create kernels according to the architecture, data type and input size.

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

356

if(is_data_type_float(data_type) && is_bifrost)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

357

{

Gian Marco Iodice

2018-04-11 15:59:10 +0100

[diff] [blame]

358

kernel_name = "gemm_mm_floating_point";

359

360

if(input0->info()->num_dimensions() != 1)

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

361

{

Gian Marco Iodice

2018-04-11 15:59:10 +0100

[diff] [blame]

362

kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";

Vidhya Sudhan Loganathan

38d93bd

2018-11-20 15:38:13 +0000

[diff] [blame]

363

if(fp_mixed_precision && data_type == DataType::F16)

364

{

365

// currently wider accumulator is only supported for fp16 kernels.

366

kernel_name += "_acc32";

367

}

Gian Marco Iodice

2018-04-11 15:59:10 +0100

[diff] [blame]

368

}

369

else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)

370

{

371

// The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and

372

// FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.

373

// FC6 and FC7 of AlexNet and VGG-16).

374

kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

375

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

376

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

377

// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels

378

// via exhaustive autotuning over a range of representative layer configurations.

Anthony Barbier

b6eb353

2018-08-08 13:20:04 +0100

[diff] [blame]

379

set_lws_hint(cl::NDRange(4));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

380

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

381

else // (MIDGARD and F32) or (F16)

382

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

383

kernel_name = "gemm_mm_floating_point";

384

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

385

build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));

386

build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

387

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

388

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

389

// Configure matrix C addition if necessary

390

build_opts.add_option_if(_has_vec_c, "-DADD_VEC_C");

391

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

392

// Create kernel

393

_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

394

395

// Set config_id for enabling LWS tuning

396

_config_id = "gemm_";

397

_config_id += (is_interleaved_transposed ? "reshaped_" : "");

Vidhya Sudhan Loganathan

a25d16c

2018-11-16 11:33:12 +0000

[diff] [blame]

398

_config_id += (fp_mixed_precision ? "fp_mixed_" : "");

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

399

_config_id += (_reinterpret_input_as_3d ? "3di_" : "");

400

_config_id += (_reinterpret_output_as_3d ? "3do_" : "");

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

401

_config_id += lower_string(string_from_data_type(input0->info()->data_type()));

402

_config_id += "_";

403

_config_id += support::cpp11::to_string(output->info()->dimension(1));

404

_config_id += "_";

405

_config_id += support::cpp11::to_string(output->info()->dimension(0));

406

_config_id += "_";

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

407

_config_id += support::cpp11::to_string(output->info()->dimension(2));

408

_config_id += "_";

409

_config_id += support::cpp11::to_string(output->info()->dimension(3));

410

_config_id += "_";

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

411

_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

412

}

413

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

414

// Static check of a kernel configuration: validates the arguments, then runs the window
// configuration on clones of the tensor infos so that the infos passed in are left untouched.
// alpha is not needed for the validation. Returns an empty Status on success, otherwise the first error found.
Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
                                            bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
{
    ARM_COMPUTE_UNUSED(alpha);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));

    // Scratch copies: the window configuration may auto-initialise the output and update paddings
    const auto input0_copy = input0->clone();
    const auto input1_copy = input1->clone();
    const auto input2_copy = (input2 != nullptr) ? input2->clone() : nullptr;
    const auto output_copy = output->clone();

    // Note: num_elements_processed will be set in validate_and_configure_window()
    ElementsProcessed num_elements_processed{};

    const auto window_result = validate_and_configure_window(input0_copy.get(),
                                                             input1_copy.get(),
                                                             input2_copy.get(),
                                                             output_copy.get(),
                                                             beta,
                                                             is_interleaved_transposed,
                                                             reshape_info,
                                                             gpu_target,
                                                             num_elements_processed);
    ARM_COMPUTE_RETURN_ON_ERROR(window_result.first);

    return Status{};
}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

435

void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)

436

{

437

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

438

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

439

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

440

if(_input1->info()->num_dimensions() < 3)

441

{

442

// The stride_z for matrix B must be zero if we do not slice

443

ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);

444

}

445

446

Window slice = window.first_slice_window_3D();

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

447

Window slice_matrix_b = slice;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

448

449

slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));

450

slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

451

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

452

const unsigned int num_arguments_vec_c = (_has_vec_c) ? num_arguments_per_1D_tensor() : 0;

453

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

454

if(_reinterpret_input_as_3d)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

455

{

Isabella Gottardi

b92805b

2018-09-28 18:24:27 +0100

[diff] [blame]

456

// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

457

const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_vec_c;

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

458

const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;

459

_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));

460

}

461

462

if(_reinterpret_output_as_3d)

463

{

464

// Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

465

const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_vec_c;

Georgios Pinitas

e8bd2c7

2018-07-11 15:54:56 +0100

[diff] [blame]

466

const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;

467

_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

468

}

469

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

470

do

471

{

472

Window slice_b = slice;

473

// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

474

// This scenario can happen when the matrix multiplication is used to perform a convolution operation

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

475

if(!_slide_matrix_b)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

476

{

477

slice_b = slice_matrix_b;

478

}

479

480

unsigned int idx = 0;

481

add_2D_tensor_argument(idx, _input0, slice);

482

add_2D_tensor_argument(idx, _input1, slice_b);

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

483

if(_has_vec_c)

484

{

485

add_1D_tensor_argument(idx, _input2, slice);

486

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

487

add_2D_tensor_argument(idx, _output, slice);

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

488

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));

489

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));

490

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));

Anthony Barbier

b6eb353

2018-08-08 13:20:04 +0100

[diff] [blame]

491

enqueue(queue, *this, slice, lws_hint());

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

492

}

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

493

while(window.slide_window_slice_3D(slice));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

494

}

giuros01