Blame - src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp - ml/ComputeLibrary

/** Checks that the tensor metadata describes a direct convolution this kernel can run.
 *
 * Every check returns early with an error Status on failure; an empty Status is returned
 * only when all checks pass.
 *
 * @param[in] input     Source tensor info. Must be single channel QS8/QASYMM8/QS16/F16/F32.
 * @param[in] weights   Weights tensor info. Same data type as @p input, square 1x1, 3x3 or 5x5
 *                      kernel, dimension 2 equal to the input's dimension 2, at most 4 dimensions.
 * @param[in] biases    Optional (may be nullptr) bias tensor info. One dimensional, with as many
 *                      elements as weights dimension 3. S32 for asymmetric quantized input,
 *                      otherwise the same data type as @p weights.
 * @param[in] output    Destination tensor info. Must not be nullptr (it is dereferenced
 *                      unconditionally); shape and type are only checked when total_size() != 0.
 * @param[in] conv_info Padding and stride information. Only the x stride is validated here.
 *
 * @return An empty Status on success, otherwise the Status of the first failed check.
 */
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);

    // Kernel geometry: square, and one of the supported sizes.
    // (The square check used to be repeated further down with a contradictory
    // "rectangular" message; that second copy could never trigger and was dropped.)
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != weights->dimension(1),
                                    "Weights should have same width as length");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 1 && weights->dimension(0) != 3 && weights->dimension(0) != 5,
                                    "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(2) != input->dimension(2),
                                    "Weights feature map dimension should match the respective input's one");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4,
                                    "Weights can be at most 4 dimensional");

    // Stride limits depend on the kernel size.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 1) && std::get<0>(conv_info.stride()) > 3,
                                    "Strides larger than 3 not supported for 1x1 convolution.");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 3 || weights->dimension(0) == 5) && std::get<0>(conv_info.stride()) > 2,
                                    "Strides larger than 2 not supported for 3x3 and 5x5 convolutions.");

    if(biases != nullptr)
    {
        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
        }
        // One bias per kernel: weights dimension 3 (dimension 2 is the input feature-map
        // dimension, as checked above).
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
                                        "Biases size and number of output feature maps should match");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
                                        "Biases should be one dimensional");
    }

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                           get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
    }

    return Status{};
}

110

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame^]

111

/** Computes the execution window for the direct convolution and the padding it requires.
 *
 * Auto-initialises @p output (shape from get_output_shape(), other properties copied from
 * @p input) if it is still empty, picks the per-iteration read/write block sizes for the
 * selected code path, and updates window and padding through update_window_and_padding().
 *
 * @param[in,out] input     Source tensor info.
 * @param[in,out] weights   Weights tensor info; dimension 0 is used as the kernel size.
 * @param[in,out] output    Destination tensor info; auto-initialised when empty.
 * @param[in]     conv_info Padding and stride information. Each pad is clamped to kernel_size / 2.
 * @param[in]     target    GPU target; BIFROST with F32 data, unit strides and kernel size <= 5
 *                          selects the dedicated block sizes.
 *
 * @return A pair of (Status, Window). The Status carries a RUNTIME_ERROR ("Insufficient Padding!")
 *         when update_window_and_padding() reports that the window changed.
 */
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
{
    const unsigned int kernel_size = weights->dimension(0);
    const DataType     data_type   = input->data_type();

    // Get convolved dimensions
    TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);

    // Output auto initialization if not yet initialized
    // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
    auto_init_if_empty(*output, output_shape,
                       1,
                       input->data_type(),
                       input->fixed_point_position(),
                       input->quantization_info());

    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    const unsigned int conv_stride_y = std::get<1>(conv_info.stride());

    // Pads are kept signed (as in configure()) because left/top are negated below to express
    // the start of the input access window; negating an unsigned value would rely on wraparound.
    const int conv_pad_left   = static_cast<int>(std::min(conv_info.pad_left(), kernel_size / 2));
    const int conv_pad_top    = static_cast<int>(std::min(conv_info.pad_top(), kernel_size / 2));
    const int conv_pad_right  = static_cast<int>(std::min(conv_info.pad_right(), kernel_size / 2));
    const int conv_pad_bottom = static_cast<int>(std::min(conv_info.pad_bottom(), kernel_size / 2));

    unsigned int num_elems_read_per_iteration_x    = 0;
    unsigned int num_elems_read_per_iteration_y    = 0;
    unsigned int num_elems_written_per_iteration_x = 0;
    unsigned int num_elems_written_per_iteration_y = 0;

    if((target == GPUTarget::BIFROST) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32))
    {
        // Block sizes for the Bifrost F32 path, per kernel size.
        // (The window itself is computed once, further down, for both paths.)
        switch(kernel_size)
        {
            case 1:
            {
                num_elems_read_per_iteration_x    = 4;
                num_elems_read_per_iteration_y    = 4;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 4;
                break;
            }
            case 3:
            {
                num_elems_read_per_iteration_x    = 6;
                num_elems_read_per_iteration_y    = 5;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 3;
                break;
            }
            case 5:
            {
                num_elems_read_per_iteration_x    = 8;
                num_elems_read_per_iteration_y    = 6;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 2;
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
            }
        }
    }
    else
    {
        const bool is_stride2 = ((kernel_size != 1) && (conv_stride_x == 2));

        num_elems_read_per_iteration_x    = 8 + 2 * (kernel_size / 2) + (is_stride2 ? 6 + kernel_size / 2 : 0);
        num_elems_read_per_iteration_y    = kernel_size;
        num_elems_written_per_iteration_x = 8;
        num_elems_written_per_iteration_y = 1;
    }

    // Calculate right and bottom border
    int input_width  = static_cast<int>(input->dimension(0)) - static_cast<int>(kernel_size / 2) + conv_pad_right;
    int input_height = static_cast<int>(input->dimension(1)) - static_cast<int>(kernel_size / 2) + conv_pad_bottom;

    // Add padding only if necessary or it would always result in a window_changed
    if(input_width % num_elems_read_per_iteration_x > 0)
    {
        input_width += num_elems_read_per_iteration_x;
    }
    if(input_height % num_elems_read_per_iteration_y > 0)
    {
        input_height += num_elems_read_per_iteration_y;
    }

    // Create window and update padding
    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));

    AccessWindowStatic    input_access(input, -conv_pad_left, -conv_pad_top, input_width, input_height);
    AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
    AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);

    const bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}

} // namespace

// Default constructor: leaves every tensor pointer null and the border/strides at zero
// until configure() fills them in.
CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
    : _input(nullptr),
      _biases(nullptr),
      _weights(nullptr),
      _output(nullptr),
      _border_size(0),
      _conv_stride_x(0),
      _conv_stride_y(0)
{
}

// Returns the border stored in _border_size. configure() sets it from the
// convolution padding, with each side clamped to half the kernel size.
BorderSize CLDirectConvolutionLayerKernel::border_size() const

{

return _border_size;

}

void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)

230

{

231

ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

232

233

const unsigned int kernel_size = weights->info()->dimension(0);

234

const DataType data_type = input->info()->data_type();

235

236

// Get convolved dimensions

237

TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);

238

239

// Output auto inizialitation if not yet initialized

240

// FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).

241

auto_init_if_empty(*output->info(),

242

output_shape,

243

1,

244

input->info()->data_type(),

245

input->info()->fixed_point_position(),

246

input->info()->quantization_info());

247

248

// Perform validation step

249

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),

250

weights->info(),

251

(biases != nullptr) ? biases->info() : nullptr,

output->info(),

conv_info));

_conv_stride_x = std::get<0>(conv_info.stride());

256

_conv_stride_y = std::get<1>(conv_info.stride());

_input = input;

_weights = weights;

_output = output;

_biases = biases;

int conv_pad_left = std::min(conv_info.pad_left(), kernel_size / 2);

264

int conv_pad_top = std::min(conv_info.pad_top(), kernel_size / 2);

265

int conv_pad_right = std::min(conv_info.pad_right(), kernel_size / 2);

266

int conv_pad_bottom = std::min(conv_info.pad_bottom(), kernel_size / 2);

267

_border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);

268

269

const GPUTarget gpu_target = get_arch_from_target(get_target());

270

271

std::stringstream kernel_name;

272

kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;

273

274

CLBuildOptions build_options;

275

build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));

276

277

if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))

278

{

279

build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));

280

281

kernel_name << "_f32_bifrost";

282

_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));

283

284

// Through extensive experimentation with over 30 representative tensor

285

// shapes, we found a small number of local work size configurations

286

// that result in nearly optimal execution times. Selecting the right

287

// lws for a given shape, however, required a complex decision tree,

288

// until we constructed a simple feature as described below.

289

//

290

// We started from the number of multiply-accumulate operations for a

291

// convolution layer, which is equal to the product of the input

292

// dimensions 0..2 and the weights dimensions 0..2. Unfortunately,

293

// this resulted in ties between distinct shapes that required distinct

294

// lws configurations. Replacing the width of the input with the kernel

295

// size, however, resulted in nearly optimal predictions. We use underscores

296

// in variable names to indicate when they are intentionally misleading.

297

const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);

298

const size_t product_of_input_dimensions_ = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);

299

const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;

switch(kernel_size)

{

case 1:

{

if(mega_ops_ < 1.f)

{

_lws_hint = cl::NDRange(1, 1, 8);

308

}

309

else if(mega_ops_ < 7.f)

310

{

311

_lws_hint = cl::NDRange(1, 1, 4);

}

else

{

_lws_hint = cl::NDRange(1, 1, 2);

}

break;

}

case 3:

{

if(mega_ops_ < 1.f)

{

_lws_hint = cl::NDRange(1, 1, 8);

324

}

325

else if(mega_ops_ < 13.f)

326

{

327

_lws_hint = cl::NDRange(2, 1, 4);

328

}

329

else if(mega_ops_ < 50.f)

330

{

331

_lws_hint = cl::NDRange(3, 1, 4);

}

else

{

_lws_hint = cl::NDRange(2, 1, 6);

}

break;

}

case 5:

{

if(mega_ops_ < 2.f || mega_ops_ > 80.f)

342

{

343

_lws_hint = cl::NDRange(2, 1, 4);

}

else

{

_lws_hint = cl::NDRange(2, 1, 8);

}

break;

}

default:

{

ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");

}

}

}

else

{

bool is_quantized_fixed_point = is_data_type_fixed_point(data_type);

360

bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);

361

DataType promoted_type = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;

362

363

build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));

364

build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));

365

build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));

366

build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));

367

build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));

368

build_options.add_option_if(is_quantized_fixed_point,

369

std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));

370

build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type)));

371

372

// Create kernel

373

_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_1x1_3x3_5x5_quantized" : kernel_name.str(),

374

build_options.options()));

375

}

376

377

// Configure kernel window

378

auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);

379

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

380

ICLKernel::configure(win_config.second);

381

382

// Set static kernel arguments

383

if(is_data_type_quantized_asymmetric(data_type))

384

{

385

int output_multiplier = 0;

386

int output_shift = 0;

387

388

float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;

389

ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));

390

391

unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;

392

_kernel.setArg(idx++, -_input->info()->quantization_info().offset);

393

_kernel.setArg(idx++, -_weights->info()->quantization_info().offset);

394

_kernel.setArg(idx++, _output->info()->quantization_info().offset);

395

_kernel.setArg(idx++, output_multiplier);

396

_kernel.setArg(idx++, output_shift);

397

}

398

399

// Set config_id for enabling LWS tuning

400

_config_id = "direct_convolution_";

401

_config_id += lower_string(string_from_data_type(data_type));

402

_config_id += "_";

403

_config_id += support::cpp11::to_string(kernel_size);

404

_config_id += "_";

405

_config_id += support::cpp11::to_string(conv_pad_left);

406

_config_id += "_";

407

_config_id += support::cpp11::to_string(conv_pad_top);

408

_config_id += "_";

409

_config_id += support::cpp11::to_string(conv_pad_right);

410

_config_id += "_";

411

_config_id += support::cpp11::to_string(conv_pad_bottom);

412

_config_id += "_";

413

_config_id += support::cpp11::to_string(_conv_stride_x);

414

_config_id += "_";

415

_config_id += support::cpp11::to_string(_conv_stride_y);

416

_config_id += "_";

417

_config_id += support::cpp11::to_string(output->info()->dimension(0));

418

_config_id += "_";

419

_config_id += support::cpp11::to_string(output->info()->dimension(1));

420

}

421

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame^]

422

Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,

423

const GPUTarget target)

Giorgio Arena

5948634

2017-12-01 10:42:47 +0000

[diff] [blame]

424

{

425

ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));

426

ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, target).first);

427

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame^]

428

return Status{};

Giorgio Arena

5948634

2017-12-01 10:42:47 +0000

[diff] [blame]

429