Blame - src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp - ml/ComputeLibrary

2017-07-18 17:37:43 +0100

[diff] [blame]

{

return _border_size;

}

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

75

void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

76

{

Georgios Pinitas

2017-11-14 15:32:57 +0000

[diff] [blame]

77

ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

78

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

79

const unsigned int kernel_size = weights->info()->dimension(0);

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

80

const DataType data_type = input->info()->data_type();

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

81

82

// Get convolved dimensions

Georgios Pinitas

2017-11-14 15:32:57 +0000

[diff] [blame]

83

TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

84

85

// Output auto inizialitation if not yet initialized

Anton Lokhmotov

af6204c

2017-11-08 09:34:19 +0000

[diff] [blame]

86

auto_init_if_empty(*output->info(),

87

output_shape,

88

1,

89

input->info()->data_type(),

90

input->info()->fixed_point_position(),

91

input->info()->quantization_info());

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

92

Georgios Pinitas

2017-11-14 15:32:57 +0000

[diff] [blame]

93

// Perform validation step

94

ARM_COMPUTE_ERROR_THROW_ON(CLDirectConvolutionLayerKernel::validate(input->info(),

95

weights->info(),

96

(biases != nullptr) ? biases->info() : nullptr,

97

output->info(),

98

conv_info));

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

99

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

100

_conv_stride_x = std::get<0>(conv_info.stride());

101

_conv_stride_y = std::get<1>(conv_info.stride());

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

102

_conv_pad_x = std::min(std::get<0>(conv_info.pad()), kernel_size / 2);

103

_conv_pad_y = std::min(std::get<1>(conv_info.pad()), kernel_size / 2);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

_input = input;

_weights = weights;

_output = output;

_biases = biases;

_border_size = BorderSize(_conv_pad_y, _conv_pad_x);

110

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

111

const GPUTarget gpu_target = get_arch_from_target(get_target());

Michalis Spyrou

def665a

2017-08-14 11:26:37 +0100

[diff] [blame]

112

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

113

std::stringstream kernel_name;

114

kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

115

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

116

CLBuildOptions build_options;

117

build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

118

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

119

if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))

120

{

121

build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));

122

123

kernel_name << "_f32_bifrost";

124

_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

125

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

126

// Configure kernel window

127

Window win = calculate_max_window(*output->info());

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

128

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

129

unsigned int num_elems_read_per_iteration_x = 0;

130

unsigned int num_elems_read_per_iteration_y = 0;

131

unsigned int num_elems_written_per_iteration_x = 0;

132

unsigned int num_elems_written_per_iteration_y = 0;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

133

Anthony Barbier

2017-11-28 10:28:47 +0000

[diff] [blame]

134

// Through extensive experimentation with over 30 representative tensor

135

// shapes, we found a small number of local work size configurations

136

// that result in nearly optimal execution times. Selecting the right

137

// lws for a given shape, however, required a complex decision tree,

138

// until we constructed a simple feature as described below.

139

//

140

// We started from the number of multiply-accumulate operations for a

141

// convolution layer, which is equal to the product of the input

142

// dimensions 0..2 and the weights dimensions 0..2. Unfortunately,

143

// this resulted in ties between distinct shapes that required distinct

144

// lws configurations. Replacing the width of the input with the kernel

145

// size, however, resulted in nearly optimal predictions. We use underscores

146

// in variable names to indicate when they are intentionally misleading.

147

const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);

148

const size_t product_of_input_dimensions_ = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);

149

const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;

150

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

151

switch(kernel_size)

152

{

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

153

case 1:

154

{

155

num_elems_read_per_iteration_x = 4;

156

num_elems_read_per_iteration_y = 4;

157

num_elems_written_per_iteration_x = 4;

158

num_elems_written_per_iteration_y = 4;

Anthony Barbier

2017-11-28 10:28:47 +0000

[diff] [blame]

159

if(mega_ops_ < 1.f)

160

{

161

_lws_hint = cl::NDRange(1, 1, 8);

162

}

163

else if(mega_ops_ < 7.f)

164

{

165

_lws_hint = cl::NDRange(1, 1, 4);

}

else

{

_lws_hint = cl::NDRange(1, 1, 2);

170

}

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

171

break;

172

}

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

173

case 3:

174

{

175

num_elems_read_per_iteration_x = 6;

176

num_elems_read_per_iteration_y = 5;

177

num_elems_written_per_iteration_x = 4;

178

num_elems_written_per_iteration_y = 3;

Anthony Barbier

2017-11-28 10:28:47 +0000

[diff] [blame]

179

if(mega_ops_ < 1.f)

180

{

181

_lws_hint = cl::NDRange(1, 1, 8);

182

}

183

else if(mega_ops_ < 13.f)

184

{

185

_lws_hint = cl::NDRange(2, 1, 4);

186

}

187

else if(mega_ops_ < 50.f)

188

{

189

_lws_hint = cl::NDRange(3, 1, 4);

}

else

{

_lws_hint = cl::NDRange(2, 1, 6);

194

}

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

break;

}

case 5:

{

num_elems_read_per_iteration_x = 8;

200

num_elems_read_per_iteration_y = 6;

201

num_elems_written_per_iteration_x = 4;

202

num_elems_written_per_iteration_y = 2;

Anthony Barbier

2017-11-28 10:28:47 +0000

[diff] [blame]

203

if(mega_ops_ < 2.f || mega_ops_ > 80.f)

204

{

205

_lws_hint = cl::NDRange(2, 1, 4);

}

else

{

_lws_hint = cl::NDRange(2, 1, 8);

210

}

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

break;

}

default:

{

ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");

216

}

217

}

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

218

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

219

// Calculate right and bottom border

220

const int input_width = input->info()->dimension(0) - kernel_size / 2 + _conv_pad_x;

221

const int input_height = input->info()->dimension(1) - kernel_size / 2 + _conv_pad_y;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

222

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

223

// Create window and update padding

224

win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

225

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

226

AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);

227

AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size);

228

AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

229

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

230

update_window_and_padding(win, input_access, weights_access, output_access);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

231

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

232

output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));

233

234

ICLKernel::configure(win);

235

}

236

else

237

{

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

238

bool is_quantized_fixed_point = is_data_type_fixed_point(data_type);

Anton Lokhmotov

af6204c

2017-11-08 09:34:19 +0000

[diff] [blame]

239

bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

240

DataType promoted_type = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

241

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

242

build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));

243

build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));

244

build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));

245

build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));

246

build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));

247

build_options.add_option_if(is_quantized_fixed_point,

248

std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));

249

build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type)));

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

250

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

251

// Create kernel

252

_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_1x1_3x3_5x5_quantized" : kernel_name.str(),

253

build_options.options()));

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

254

255

// Configure kernel window

256

257

bool is_stride2 = ((kernel_size != 1) && (_conv_stride_x == 2));

258

259

const unsigned int num_elems_read_per_iteration_x = 8 + 2 * (kernel_size / 2) + (is_stride2 ? 6 + kernel_size / 2 : 0);

260

const unsigned int num_elems_read_per_iteration_y = kernel_size;

261

const unsigned int num_elems_written_per_iteration_x = 8;

262

const unsigned int num_elems_written_per_iteration_y = 1;

263

264

// Calculate right and bottom border

265

const int input_width = input->info()->dimension(0) - kernel_size / 2 + _conv_pad_x;

266

const int input_height = input->info()->dimension(1) - kernel_size / 2 + _conv_pad_y;

267

268

// Create window and update padding

269

Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));

270

271

AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);

272

AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size);

273

AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);

274

275

update_window_and_padding(win, input_access, weights_access, output_access);

276

277

output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));

278

279

ICLKernel::configure(win);

280

}

Gian Marco

de691f0

2017-09-08 16:13:11 +0100

[diff] [blame]

281

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

282

// Set static kernel arguments

Anton Lokhmotov

af6204c

2017-11-08 09:34:19 +0000

[diff] [blame]

283

if(is_data_type_quantized_asymmetric(data_type))

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

284

{

285

int output_multiplier = 0;

286

int output_shift = 0;

287

288

float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;

289

ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));

290

291

unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;

292

_kernel.setArg(idx++, -_input->info()->quantization_info().offset);

293

_kernel.setArg(idx++, -_weights->info()->quantization_info().offset);

294

_kernel.setArg(idx++, _output->info()->quantization_info().offset);

295

_kernel.setArg(idx++, output_multiplier);

296

_kernel.setArg(idx++, output_shift);

297

}

298

Gian Marco

de691f0

2017-09-08 16:13:11 +0100

[diff] [blame]

299

// Set config_id for enabling LWS tuning

300

_config_id = "direct_convolution_";

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

301

_config_id += lower_string(string_from_data_type(data_type));

Gian Marco

de691f0

2017-09-08 16:13:11 +0100

[diff] [blame]

302

_config_id += "_";

303

_config_id += support::cpp11::to_string(kernel_size);

304

_config_id += "_";

305

_config_id += support::cpp11::to_string(_conv_pad_x);

306

_config_id += "_";

307

_config_id += support::cpp11::to_string(_conv_pad_y);

308

_config_id += "_";

309

_config_id += support::cpp11::to_string(_conv_stride_x);

310

_config_id += "_";

311

_config_id += support::cpp11::to_string(_conv_stride_y);

312

_config_id += "_";

313

_config_id += support::cpp11::to_string(output->info()->dimension(0));

314

_config_id += "_";

315

_config_id += support::cpp11::to_string(output->info()->dimension(1));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

316

}

317

Georgios Pinitas

2017-11-14 15:32:57 +0000

[diff] [blame]

318

/** Check whether the given tensor infos and convolution info form a valid configuration
 *  for CLDirectConvolutionLayerKernel.
 *
 * @param[in] input     Source tensor info. Data types checked: QS8/QASYMM8/QS16/F16/F32, single channel.
 * @param[in] weights   Weights tensor info. Must be square (1x1, 3x3 or 5x5), have the input's data type,
 *                      match the input on dimension 2 and have at most 4 dimensions.
 * @param[in] biases    Biases tensor info. May be nullptr. Must be 1D with size equal to weights dimension 3;
 *                      S32 for asymmetric quantized input, otherwise the weights' data type.
 * @param[in] output    Destination tensor info. Shape, data type and fixed point position are only
 *                      checked when it is already initialised (total_size() != 0).
 * @param[in] conv_info Padding and stride information. Only the stride along X is checked here.
 *
 * @return An empty Error on success, otherwise the error raised by the first failing check.
 */
Error CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != weights->dimension(1),
                                    "Weights should have same width as length");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 1 && weights->dimension(0) != 3 && weights->dimension(0) != 5,
                                    "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(2) != input->dimension(2),
                                    "Weights feature map dimension should match the respective input's one");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4,
                                    "Weights can be at most 4 dimensional");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 1) && std::get<0>(conv_info.stride()) > 3,
                                    "Strides larger than 3 not supported for 1x1 convolution.");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 3 || weights->dimension(0) == 5) && std::get<0>(conv_info.stride()) > 2,
                                    "Strides larger than 2 not supported for 3x3 and 5x5 convolution.");

    if(biases != nullptr)
    {
        // Asymmetric quantized convolutions take S32 biases; all other types share the weights' data type
        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
        }
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
                                        "Biases size and number of output feature maps should match");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
                                        "Biases should be one dimensional");
    }

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                           get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
    }

    return Error{};
}

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

365

void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

366

{

367

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

368

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

369

370

// Get initial windows

371

Window slice = window.first_slice_window_3D();

372

Window win_in = window;

373

374

win_in.adjust(Window::DimX, -_conv_pad_x, true);

375

win_in.adjust(Window::DimY, -_conv_pad_y, true);

376

win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);

377

win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);

378

379

Window slice_in = win_in.first_slice_window_3D();

380

381

unsigned int idx1 = 2 * num_arguments_per_3D_tensor();

382

add_3D_tensor_argument(idx1, _weights, slice);

383

384

if(_biases != nullptr)

385

{

386

Window slice_biases;

SiCong Li

86b5333

2017-08-23 11:02:43 +0100

[diff] [blame]

387

slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

388

add_1D_tensor_argument(idx1, _biases, slice_biases);

389

}

390

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

391

_kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));

392

steniu01