Blame - src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp - ml/ComputeLibrary

2017-12-07 16:47:52 +0000

[diff] [blame]

53

{

Georgios Pinitas

78c0090

2018-01-09 17:33:11 +0000

[diff] [blame]

54

ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);

Vidhya Sudhan Loganathan

f1f4906

2018-05-25 13:21:26 +0100

[diff] [blame^]

55

ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

56

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

57

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);

58

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

59

ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

60

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

61

if(!is_interleaved_transposed)

62

{

63

ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

64

65

if(output->total_size() != 0)

66

{

67

ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));

68

ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));

69

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);

70

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);

}

}

else

{

const int m = reshape_info.m();

76

const int n = reshape_info.n();

77

const int k = reshape_info.k();

78

const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();

79

const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

80

81

TensorShape tensor_shape0{ input0->tensor_shape() };

82

tensor_shape0.set(0, k);

83

tensor_shape0.set(1, m);

84

85

TensorShape tensor_shape1{ input1->tensor_shape() };

86

tensor_shape1.set(0, n);

87

tensor_shape1.set(1, k);

88

89

const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);

90

const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);

91

92

const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));

93

const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));

94

95

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);

96

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);

97

98

if(output->total_size() != 0)

99

{

100

ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));

101

ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));

102

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);

103

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);

104

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

}

return Status{};

}

// Computes the execution window for the GEMM matrix-multiply kernel and registers the
// access patterns of input0, input1 and output via update_window_and_padding().
//
// - output is auto-initialised (if empty) from a clone of input0 with the shape returned by compute_mm_shape().
// - num_elements_processed[0] / [1] are overwritten with the X / Y step sizes chosen here.
// - gpu_target is only consulted on the non-reshaped path (BIFROST + F32 special case).
//
// Returns a pair of:
//   first  - Status{} on success, or a RUNTIME_ERROR ("Insufficient Padding!") when
//            update_window_and_padding() reports a change;
//   second - the window after win.collapse() has been applied.
inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,

Gian Marco Iodice

2018-05-08 12:01:57 +0100

[diff] [blame]

111

bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

112

ElementsProcessed &num_elements_processed)

113

{

114

bool window_changed = false;

115

Window win{};

116

117

const DataType data_type = input0->data_type();

118

// References into num_elements_processed: writes below are visible to the caller
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];

119

unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];

120

Gian Marco Iodice

2018-05-08 12:01:57 +0100

[diff] [blame]

121

// Output tensor auto initialization if not yet initialized

122

auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));

123

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

124

if(is_interleaved_transposed)

125

{

126

// Configure kernel window

127

num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);

128

num_elems_processed_per_iteration_y = 4;

129

130

win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

131

132

AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);

Georgios Pinitas

535fedd

2018-05-04 18:52:25 +0100

[diff] [blame]

133

// input1 access region: both dimensions rounded up to a multiple of the per-iteration step
AccessWindowStatic input1_access(input1, 0, 0,

134

ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),

135

ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

136

AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);

137

138

window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);

139

140

output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));

141

}

142

else // The input tensors have not been reshaped

143

{

144

// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.

145

num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);

146

// Y step is the output height, capped at 4
num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);

147

148

// Create kernels according to the architecture, data type and input size.

Michalis Spyrou

2018-02-22 18:07:43 +0000

[diff] [blame]

149

GPUTarget arch_target = get_arch_from_target(gpu_target);

150

if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

151

{

Gian Marco

1d25ed5

2017-12-16 19:33:50 +0000

[diff] [blame]

152

// X step of 2 when input1 has at most 1000 columns and input0 is 1-dimensional, 4 otherwise
num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

}

// Configure window

win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

157

158

AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));

159

AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));

160

AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);

161

162

window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);

163

164

Coordinates coord;

165

coord.set_num_dimensions(output->num_dimensions());

166

output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));

167

}

168

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

169

// Collapse along the Z direction

170

// This collapse needs to be here in order to tune the Z dimension of LWS

Gian Marco Iodice

81b28c4

2018-03-29 10:29:36 +0100

[diff] [blame]

171

Window collapsed = win;

172

// Collapse starts at dimension 2, or earlier if the output has fewer than 2 dimensions
const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);

173

collapsed = win.collapse(win, dimension_to_collapse);

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

174

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

175

// A change reported by update_window_and_padding() is surfaced to the caller as an error
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

176

return std::make_pair(err, collapsed);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

}

} // namespace

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

180

// Default constructor: the tensor pointers start as nullptr and _slide_matrix_b as true.
// configure() assigns the actual values.
CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

181

: _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

{

}

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

185

// Configures the kernel for the given tensors.
//
// Steps, in order:
//   1. validate the arguments (validate_arguments) and store the tensor pointers;
//   2. pick an LWS hint from the GPU target and some tensor dimensions;
//   3. compute the execution window (validate_and_configure_window) and pass it to ICLKernel::configure();
//   4. assemble the OpenCL build options and select the kernel name;
//   5. create the kernel and build _config_id.
//
// input0, input1            - input tensors (must not be nullptr)
// output                    - output tensor (must not be nullptr)
// alpha                     - scalar; -DALPHA is only defined when it differs from 1.0f by more than 0.00001f
// is_interleaved_transposed - selects the "interleaved_transposed" kernels and the reshaped window path
// reshape_info              - provides mult_transpose1xW_width / mult_interleave4x4_height for the reshaped path
void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

186

{

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

187

ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);

188

189

// Perform validate step

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

190

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

191

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

_input0 = input0;

_input1 = input1;

_output = output;

// Matrix B is slid only when it has at least as many dimensions as matrix A
_slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

196

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

197

const DataType data_type = input0->info()->data_type();

198

const int fp_pos = input0->info()->fixed_point_position();

199

200

// Get target architecture

Michalis Spyrou

2018-02-22 18:07:43 +0000

[diff] [blame]

201

GPUTarget gpu_target = get_target();

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

202

203

// Configure LWS hint

Michalis Spyrou

2018-02-22 18:07:43 +0000

[diff] [blame]

204

switch(gpu_target)

Anthony Barbier

fcd52fb

2017-11-28 10:31:43 +0000

[diff] [blame]

205

{

Michalis Spyrou

2018-02-22 18:07:43 +0000

[diff] [blame]

206

case GPUTarget::MIDGARD:

207

case GPUTarget::T600:

208

case GPUTarget::T700:

209

case GPUTarget::T800:

210

if(output->info()->dimension(1) == 196)

211

{

212

_lws_hint = cl::NDRange(1, 7);

}

else

{

_lws_hint = cl::NDRange(8, 8);

}

break;

case GPUTarget::G71:

case GPUTarget::G72:

Sam Laynton

56e8e86

2018-04-05 13:26:08 +0100

[diff] [blame]

221

case GPUTarget::G51:

222

case GPUTarget::G51BIG:

223

case GPUTarget::G51LIT:

224

case GPUTarget::TNOX:

Michalis Spyrou

2018-02-22 18:07:43 +0000

[diff] [blame]

225

if(input1->info()->dimension(1) == 24)

226

{

227

// LWS optimized for the 11x11 AlexNet convolution on Bifrost.

228

_lws_hint = cl::NDRange(2, 2);

229

}

230

else if(output->info()->dimension(1) == 196)

231

{

232

_lws_hint = cl::NDRange(1, 7);

}

else

{

_lws_hint = cl::NDRange(8, 8);

}

break;

// Any other target: no explicit LWS hint
default:

_lws_hint = cl::NullRange;

Anthony Barbier

fcd52fb

2017-11-28 10:31:43 +0000

[diff] [blame]

241

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

242

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

243

// Filled in by validate_and_configure_window(); read below for the NUM_ELEMS_PROCESSED_PER_THREAD_* options
ElementsProcessed num_elements_processed{};

244

245

// Configure kernel window

Gian Marco Iodice

2018-05-08 12:01:57 +0100

[diff] [blame]

246

auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

247

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

248

ICLKernel::configure(win_config.second);

249

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

250

// Create build options

251

CLBuildOptions build_opts;

252

build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos));

253

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

254

// Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

255

if(std::abs(1.0f - alpha) > 0.00001f)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

256

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

257

// Fixed-point types: alpha is converted with sqcvt_qs8_f32 / sqcvt_qs16_f32 using fp_pos;
// all other types: alpha is emitted through float_to_string_with_full_precision
build_opts.add_option_if_else(is_data_type_fixed_point(data_type),

258

"-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))),

259

"-DALPHA=" + float_to_string_with_full_precision(alpha));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

260

}

261

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

262

// Do not slide matrix B if _slide_matrix_b = false

263

build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));

264

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

265

const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;

266

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

267

std::string kernel_name;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

268

if(is_interleaved_transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

269

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

270

const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();

271

const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

272

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

273

build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

274

build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));

275

build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));

276

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

277

// Float data on a Bifrost architecture uses the "_bifrost" variant of the kernel name
if(is_data_type_float(data_type) && is_bifrost)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

278

{

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

279

kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

280

}

281

else

282

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

283

kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

284

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

285

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

286

else // The input tensors have not been reshaped

287

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

288

build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));

Gian Marco Iodice

e52a300

2018-04-11 15:59:10 +0100

[diff] [blame]

289

build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

290

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

291

// Create kernels according to the architecture, data type and input size.

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

292

if(is_data_type_float(data_type) && is_bifrost)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

293

{

Gian Marco Iodice

e52a300

2018-04-11 15:59:10 +0100

[diff] [blame]

294

// Base name; a suffix is appended below unless input0 is 1-dimensional and neither suffix condition holds
kernel_name = "gemm_mm_floating_point";

295

296

if(input0->info()->num_dimensions() != 1)

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

297

{

Gian Marco Iodice

e52a300

2018-04-11 15:59:10 +0100

[diff] [blame]

298

kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";

299

}

300

else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)

301

{

302

// The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and

303

// FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.

304

// FC6 and FC7 of AlexNet and VGG-16).

305

kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";

Gian Marco Iodice

fd68311

2018-04-17 09:52:44 +0100

[diff] [blame]

306

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

307

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

308

// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels

309

// via exhaustive autotuning over a range of representative layer configurations.

310

// Note: this replaces the LWS hint chosen by the switch on gpu_target above
_lws_hint = cl::NDRange(4);

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

311

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

312

else if(is_data_type_fixed_point(data_type))

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

313

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

314

kernel_name = "gemm_mm_" + lower_string(string_from_data_type(data_type));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

315

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

316

else // (MIDGARD and F32) or (F16)

317

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

318

kernel_name = "gemm_mm_floating_point";

319

}

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

320

// Step sizes chosen by validate_and_configure_window() are passed to the OpenCL program as defines
build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));

321

build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

322

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

323

324

// Create kernel

325

_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));

326

327

// Set config_id for enabling LWS tuning

328

// Layout: "gemm_" ["reshaped_"] <data type> "_" <out dim1> "_" <out dim0> "_" <out dim2> "_" <out dim3> "_" <input1 dim0 if reshaped, else input1 dim1>
_config_id = "gemm_";

329

_config_id += (is_interleaved_transposed ? "reshaped_" : "");

330

_config_id += lower_string(string_from_data_type(input0->info()->data_type()));

331

_config_id += "_";

332

_config_id += support::cpp11::to_string(output->info()->dimension(1));

333

_config_id += "_";

334

_config_id += support::cpp11::to_string(output->info()->dimension(0));

335

_config_id += "_";

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

336

_config_id += support::cpp11::to_string(output->info()->dimension(2));

337

_config_id += "_";

338

_config_id += support::cpp11::to_string(output->info()->dimension(3));

339

_config_id += "_";

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

340

_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

341

}

342

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

343

// Checks whether the given tensor infos describe a valid configuration for this kernel.
//
// Runs validate_arguments() followed by validate_and_configure_window() and returns the
// first error encountered, or an empty Status{} when both succeed.
//
// alpha is not inspected (marked unused below).
// gpu_target is forwarded to validate_and_configure_window().
Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,

344

const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

345

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

346

// Note: num_elements_processed will be set in validate_and_configure_window()

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

347

ElementsProcessed num_elements_processed{};

348

ARM_COMPUTE_UNUSED(alpha);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

349

ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

350

// Clones are passed because validate_and_configure_window() takes non-const ITensorInfo pointers
// while this function only receives const ones; only the Status (.first) of the result is used.
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),

351

input1->clone().get(),

352

output->clone().get(),

353

is_interleaved_transposed,

Gian Marco Iodice

2018-05-08 12:01:57 +0100

[diff] [blame]

354

reshape_info,

Georgios Pinitas

2017-12-07 16:47:52 +0000

[diff] [blame]

355

gpu_target,

356

num_elements_processed)

.first);

return Status{};

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

362

void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)

363

{

364

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

365

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

366

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

367

if(_input1->info()->num_dimensions() < 3)

368

{

369

// The stride_z for matrix B must be zero if we do not slice

370

ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);

371

}

372

373

Window slice = window.first_slice_window_3D();

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

374

Window slice_matrix_b = slice;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

375

376

slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));

377

slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

do

{

Window slice_b = slice;

382

// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

383

// This scenario can happen when the matrix multiplication is used to perform a convolution operation

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

384

if(!_slide_matrix_b)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

385

{

386

slice_b = slice_matrix_b;

387

}

388

389

unsigned int idx = 0;

390

add_2D_tensor_argument(idx, _input0, slice);

391

add_2D_tensor_argument(idx, _input1, slice_b);

392

add_2D_tensor_argument(idx, _output, slice);

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

393

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));

394

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));

395

_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

396

enqueue(queue, *this, slice, _lws_hint);

397

}

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

398

while(window.slide_window_slice_3D(slice));

Anthony Barbier