Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 1 | /* |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 2 | * Copyright (c) 2017-2018 ARM Limited. |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | |
// Workgroup dimensions are injected at shader build time via LOCAL_SIZE_X/Y/Z defines.
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;

#include "helpers_cs.h"

// FP16 kernels only need medium precision for float arithmetic.
#if defined(DATA_TYPE_FP16)
precision mediump float;
#endif // DATA_TYPE_FP16
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 32 | |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 33 | #ifdef RESHAPE_TO_COLUMNS |
| 34 | |
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
 *
 * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
 * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
 * @note The biases pointer and attributes are only declared when HAS_BIAS is defined.
 *
 * @param[in]  src_ptr       Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_attrs     The attributes of the source tensor
 * @param[out] dst_ptr       Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_attrs     The attributes of the destination tensor
 * @param[in]  biases_ptr    Pointer to the biases tensor. Same as @p src_ptr
 * @param[in]  biases_attrs  The attributes of the biases tensor
 * @param[in]  width         The width of the input tensor
 * @param[in]  height        The height of the input tensor
 * @param[in]  depth         The depth of the input tensor
 * @param[in]  total_filters Total number of filters. 4th dimension of the weights matrix
 */

SHADER_PARAMS_DECLARATION
{
    Tensor3DAttributes src_attrs;
    ImageAttributes dst_attrs;
#ifdef HAS_BIAS
    VectorAttributes biases_attrs;
#endif /* HAS_BIAS */
    uint width;
    uint height;
    uint depth;
    uint total_filters;
};
| 64 | |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 65 | #if defined(DATA_TYPE_FP32) |
| 66 | |
| 67 | TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); |
| 68 | TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly); |
| 69 | #ifdef HAS_BIAS |
| 70 | TENSOR_DECLARATION(3, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly); |
| 71 | #endif /* BIAS */ |
| 72 | |
/** FP32 variant: copies one weight element per filter into the reshaped GEMM matrix.
 *
 * Each invocation owns one position of the kernel window and writes one destination
 * row; the last invocation additionally appends the bias row when HAS_BIAS is set.
 */
void main()
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
#ifdef HAS_BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* HAS_BIAS */

    // True only for the invocation with the highest (x, y, z) global id.
    bool last_invocation = (int(gl_GlobalInvocationID.x) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1))
                           && (int(gl_GlobalInvocationID.y) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
                           && (int(gl_GlobalInvocationID.z) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1));

    // Destination row for this invocation, derived from its flattened (x, y, z) position.
    uint dst_row_offset = (uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y))
                          + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y))
                          + (uint(gl_GlobalInvocationID.z) * uint(width) * uint(height) * uint(dst_attrs.stride_y));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_row_offset);

    // Linearize convolution elements
    if(last_invocation)
    {
        // The last invocation also stores the bias value one row below each element.
        for(uint i = 0u; i < uint(total_filters); ++i)
        {
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
#ifdef HAS_BIAS
            STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), LOAD_CURRENT_ITEM(biases_ptr, biases_iter));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, biases_attrs.stride_x);
#endif /* HAS_BIAS */
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
        }
    }
    else
    {
        for(uint i = 0u; i < uint(total_filters); ++i)
        {
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
        }
    }
}
| 113 | |
#elif defined(DATA_TYPE_FP16)

// FP16: two 16-bit floats packed into each 32-bit uint buffer element.
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
#ifdef HAS_BIAS
TENSOR_DECLARATION(3, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* HAS_BIAS */
| 121 | |
/* FP16 variant: two filters are processed per loop iteration because destination
 * elements are stored as packed pairs of 16-bit halves inside a 32-bit word. */
void main()
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
#ifdef HAS_BIAS
    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* HAS_BIAS */

    bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
                           && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.z) * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
    // Linearize convolution elements
    if(is_last_thread)
    {
        for(uint i = 0u; i < uint(total_filters); i = i + 2u)
        {
            // A 32-bit load yields two fp16 values; select the half addressed by the
            // current byte offset (offset/2 even -> low half, odd -> high half).
            vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            vec2 s;
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.x = s0.x;
            }
            else
            {
                s.x = s0.y;
            }
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));

            // Second filter of the pair, selected the same way.
            vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.y = s1.x;
            }
            else
            {
                s.y = s1.y;
            }
            STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
#ifdef HAS_BIAS
            // The last invocation appends the packed pair of bias values one row below.
            vec2 b = LOAD_UNPACK2_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
            STORE_PACK2_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, (2u * biases_attrs.stride_x));
#endif /* HAS_BIAS */
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
        }
    }
    else
    {
        for(uint i = 0u; i < uint(total_filters); i = i + 2u)
        {
            vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            vec2 s;
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.x = s0.x;
            }
            else
            {
                s.x = s0.y;
            }
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));

            vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
            if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
            {
                s.y = s1.x;
            }
            else
            {
                s.y = s1.y;
            }
            STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
        }
    }
}
| 202 | |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 203 | #endif /* DATA_TYPE_FP32 */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 204 | #endif // RESHAPE_TO_COLUMNS |
| 205 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 206 | #ifdef IM2COL_GENERIC |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 207 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 208 | /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. |
| 209 | * |
| 210 | * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 211 | * @note PAD_LEFT/PAD_RIGHT/PAD_TOP/PAD_BOTTOM must be passed for padding info, e.g. "#define PAD_LEFT xxx" |
| 212 | * @note KERNEL_WIDTH/KERNEL_HEIGHT/KERNEL_DEPTH must be passed for kernel dimension, e.g. "#define KERNEL_WIDTH xxx" |
| 213 | * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx" |
| 214 | * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx" |
| 215 | * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx" |
Alex Gilday | 7da29b6 | 2018-03-23 14:16:00 +0000 | [diff] [blame] | 216 | * @note DILATION_X/DILATION_Y must be passed for dilation sizes, e.g. "#define DILATION_X xxx" |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 217 | * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. |
| 218 | * |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 219 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 |
| 220 | * @param[in] src_attrs The attributes of the source tensor |
| 221 | * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr |
| 222 | * @param[in] dst_attrs The attributes of the destination tensor |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 223 | * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). |
| 224 | * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 225 | */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 226 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 227 | SHADER_PARAMS_DECLARATION |
| 228 | { |
| 229 | Tensor3DAttributes src_attrs; |
| 230 | ImageAttributes dst_attrs; |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 231 | uint src_stride_w; |
| 232 | uint dst_stride_w; |
| 233 | }; |
| 234 | |
#ifdef DATA_TYPE_FP32

TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);

/* FP32 im2col: each invocation linearizes one kernel window of one input channel
 * into a contiguous segment of one output row. */
void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    int xc = int(gl_GlobalInvocationID.x); // x coordinate in the convolved tensor
    int yc = int(gl_GlobalInvocationID.y); // y coordinate in the convolved tensor
    int ch = int(gl_GlobalInvocationID.z) % KERNEL_DEPTH; // input feature map
    int batch = int(gl_GlobalInvocationID.z) / KERNEL_DEPTH; // the batch

    // Calculate input indices (top-left corner of the receptive field, may be negative with padding)
    int xi = xc * STRIDE_X - PAD_LEFT;
    int yi = yc * STRIDE_Y - PAD_TOP;
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * int(src_attrs.stride_z)) + (batch * int(src_stride_w)));

    // Calculate output indices
    int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
    int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
    // sizeof is not available in GLES, so we'll use stride_x
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * int(dst_attrs.stride_y)) + (batch * int(dst_stride_w)) + xo * int(dst_attrs.stride_x));

    uint src_pos = 0u;

    // Linearize convolution elements: walk the (dilated) kernel window row by row
    for(int y = yi, y_e = yi + KERNEL_HEIGHT * DILATION_Y; y < y_e; y += DILATION_Y)
    {
        for(int x = xi, x_e = xi + KERNEL_WIDTH * DILATION_X; x < x_e; x += DILATION_X, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, int(dst_attrs.stride_x)))
        {
#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            // No padding: every window position is guaranteed in-bounds
            src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
#else /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
            // Out-of-bounds window positions are written as zero (implicit padding)
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, 0.0f);
            }
            else
            {
                src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
                STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
            }
#endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
        }
    }

#ifdef HAS_BIAS
    // Only the last input channel appends the bias multiplicand 1.0 at the end of the row
    if(ch == (KERNEL_DEPTH - 1))
    {
        STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f);
    }
#endif /* HAS_BIAS */
}
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 292 | |
#elif defined(DATA_TYPE_FP16)

// FP16: two 16-bit values packed per 32-bit buffer element.
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);

#ifdef KERNEL_1x1

/* FP16 im2col for 1x1 kernels: each output element comes from one input element,
 * so the kernel only needs to repack fp16 pairs across channels. Even channels do
 * the work for their odd neighbor; odd-channel invocations exit early. */
void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);

    uint xc = gl_GlobalInvocationID.x;
    uint yc = gl_GlobalInvocationID.y;
    uint zc = gl_GlobalInvocationID.z;
    uint ch = zc % uint(KERNEL_DEPTH); // input feature map
    uint batch = zc / uint(KERNEL_DEPTH); // the batch

    // Calculate input indices (1x1 kernel: output coordinate maps directly to input)
    uint xi = xc;
    uint yi = yc;
    // NOTE(review): step_z is used here while stride_z is used below for the neighbor
    // channel - presumably step_z covers the padded z-step; confirm against helpers_cs.h.
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.step_z);

    // Calculate output indices
    uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x;
    uint xo = ch * dst_element_count;
    uint yo = xc + yc * uint(CONVOLVED_WIDTH);
    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo);

    bool x_start_even = ((xc % 2u) == 0u);
    bool z_depth_even = ((uint(KERNEL_DEPTH) % 2u) == 0u);
    uint input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y);
    uint tmp_left = 0u;
    uint tmp_right = 0u;

    // Odd channels are handled by the preceding even channel's invocation.
    if(ch % 2u != 0u)
    {
        return;
    }

    if(z_depth_even || (!z_depth_even && (int(ch) < (KERNEL_DEPTH - 1))))
    {
        // Pair this channel's value with the next channel's value into one 32-bit word.
        tmp_left = LOAD(src_ptr, input_pos);
        input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y + src_attrs.stride_z);
        tmp_right = LOAD(src_ptr, input_pos);
        if(x_start_even)
        {
            tmp_right = (tmp_left & 0xffffu) + (tmp_right << 16u);
        }
        else
        {
            tmp_right = (tmp_left >> 16u) + (tmp_right & 0xffff0000u);
        }
        STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
        TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);

#ifdef HAS_BIAS
        // Last channel pair appends the bias multiplicand (1.0 packed in the low half).
        if(ch == (uint(KERNEL_DEPTH) - 2u))
        {
            mediump vec2 bias_vec = vec2(1.f, 0.f);
            uint bias_u = packHalf2x16(bias_vec);
            STORE_CURRENT_ITEM(dst_ptr, dst_iter, bias_u);
        }
#endif /* HAS_BIAS */
    }
    else
    {
        // Odd KERNEL_DEPTH and this is the last channel: no partner channel to pack with.
        tmp_left = LOAD(src_ptr, input_pos);
        if(x_start_even)
        {
            tmp_right = (tmp_left & 0xffffu);
        }
        else
        {
            tmp_right = (tmp_left >> 16u);
        }

#ifdef HAS_BIAS
        // Pack the bias multiplicand 1.0 into the unused high half of the word.
        mediump vec2 bias_vec = vec2(0.f, 1.f);
        uint bias_u = packHalf2x16(bias_vec);
        tmp_right += (bias_u & 0xffff0000u);
#endif /* HAS_BIAS */

        STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
    }
}
| 379 | |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 380 | #else /* KERNEL_1x1 */ |
| 381 | |
| 382 | void main(void) |
| 383 | { |
| 384 | uint xc = gl_GlobalInvocationID.x; |
| 385 | uint yc = gl_GlobalInvocationID.y; |
| 386 | uint zc = gl_GlobalInvocationID.z; |
| 387 | uint ch = zc % uint(KERNEL_DEPTH); // input feature map |
| 388 | uint batch = zc / uint(KERNEL_DEPTH); // the batch |
| 389 | |
| 390 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); |
| 391 | Tensor3DIterator src_iter_b = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); |
| 392 | ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); |
| 393 | |
| 394 | // Calculate input indeces |
| 395 | uint src_element_count = src_attrs.step_x / src_attrs.stride_x; |
| 396 | uint xi = (xc * uint(STRIDE_X)) / src_element_count; |
| 397 | uint yi = yc * uint(STRIDE_Y); |
| 398 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.stride_z); |
| 399 | |
| 400 | // Calculate output indeces |
| 401 | uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x; |
| 402 | uint xo = (ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT)) * dst_element_count; |
| 403 | uint yo = xc + yc * uint(CONVOLVED_WIDTH); |
| 404 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo); |
| 405 | |
| 406 | bool x_start_even = ((xc * uint(STRIDE_X)) % 2u == 0u); |
| 407 | bool z_start_even = ((ch % 2u) == 0u); |
| 408 | uint input_pos = 0u; |
| 409 | uint tmp = 0u; |
| 410 | uint tmp_left = 0u; |
| 411 | uint tmp_right = 0u; |
| 412 | |
| 413 | // Linearize convolution elements |
| 414 | for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y) |
| 415 | { |
| 416 | uint xstart = 0u; |
| 417 | uint xend = 0u; |
| 418 | |
| 419 | // even col, even row |
| 420 | if(x_start_even) |
| 421 | { |
| 422 | if(((y - yi + ch) % 2u) == 0u) |
| 423 | { |
| 424 | for(uint x = xi, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) |
| 425 | { |
| 426 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 427 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos)); |
| 428 | } |
| 429 | } |
| 430 | else |
| 431 | { |
| 432 | // 1st pair |
| 433 | if(!z_start_even && (y == yi)) |
| 434 | { |
| 435 | // cross 2d feature map |
| 436 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + |
| 437 | (ch - 1u) * src_attrs.stride_z); |
| 438 | } |
| 439 | else |
| 440 | { |
| 441 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, |
| 442 | (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y); |
| 443 | } |
| 444 | tmp_right = LOAD(src_ptr, input_pos); |
| 445 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); |
| 446 | tmp_left = LOAD(src_ptr, input_pos); |
| 447 | tmp_right = (tmp_right & 0xffffu) + (tmp_left << 16u); |
| 448 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 449 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); |
| 450 | |
| 451 | // remaining |
| 452 | for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) |
| 453 | { |
| 454 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x - 1u) * src_attrs.step_x + y * src_attrs.stride_y); |
| 455 | tmp_left = LOAD(src_ptr, input_pos); |
| 456 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 457 | tmp_right = LOAD(src_ptr, input_pos); |
| 458 | tmp_right = (tmp_left >> 16u) + (tmp_right << 16u); |
| 459 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 460 | } |
| 461 | } |
| 462 | } |
| 463 | else |
| 464 | { |
| 465 | if((((y - yi) % 2u) == 0u && !z_start_even) || (((y - yi) % 2u) != 0u && z_start_even)) |
| 466 | { |
| 467 | // 1st pair |
| 468 | if(y == yi) |
| 469 | { |
| 470 | // cross 2d feature map |
| 471 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + |
| 472 | (ch - 1u) * src_attrs.stride_z); |
| 473 | } |
| 474 | else |
| 475 | { |
| 476 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, |
| 477 | (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y); |
| 478 | } |
| 479 | |
| 480 | tmp_right = LOAD(src_ptr, input_pos); |
| 481 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); |
| 482 | tmp_left = LOAD(src_ptr, input_pos); |
| 483 | tmp_right = (tmp_right >> 16u) + (tmp_left & 0xffff0000u); |
| 484 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 485 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); |
| 486 | |
| 487 | // remaining |
| 488 | for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) |
| 489 | { |
| 490 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 491 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos)); |
| 492 | } |
| 493 | } |
| 494 | else if((((y - yi) % 2u) == 0u && z_start_even) || (((y - yi) % 2u) != 0u && !z_start_even)) |
| 495 | { |
| 496 | // 1st pair |
| 497 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); |
| 498 | tmp_right = LOAD(src_ptr, input_pos); |
| 499 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + 1u) * src_attrs.step_x + y * src_attrs.stride_y); |
| 500 | tmp_left = LOAD(src_ptr, input_pos); |
| 501 | tmp_right = (tmp_right >> 16u) + (tmp_left << 16u); |
| 502 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 503 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); |
| 504 | |
| 505 | // remaining |
| 506 | for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) |
| 507 | { |
| 508 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 509 | tmp_right = LOAD(src_ptr, input_pos); |
| 510 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x + 1u) * src_attrs.step_x + y * src_attrs.stride_y); |
| 511 | tmp_left = LOAD(src_ptr, input_pos); |
| 512 | tmp_right = (tmp_right >> 16u) + (tmp_left << 16u); |
| 513 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); |
| 514 | } |
| 515 | } |
| 516 | } |
| 517 | } |
| 518 | |
| 519 | // NOTE: must handle last element manually instead of in loops |
| 520 | // to avoid write conflict across 2d boundary |
| 521 | if(ch == uint(KERNEL_DEPTH) - 1u) |
| 522 | { |
| 523 | uint x = xi + (uint(KERNEL_WIDTH) / 2u); |
| 524 | uint y = yi + uint(KERNEL_HEIGHT) - 1u; |
| 525 | input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); |
| 526 | tmp = LOAD(src_ptr, input_pos); |
| 527 | if(!x_start_even) |
| 528 | { |
| 529 | tmp = (tmp >> 16u) + (tmp << 16u); |
| 530 | } |
| 531 | |
| 532 | #ifdef HAS_BIAS |
| 533 | mediump vec2 bias_vec = vec2(1.f, 1.f); |
| 534 | uint bias_u = packHalf2x16(bias_vec); |
| 535 | if(z_start_even) |
| 536 | { |
| 537 | tmp = (tmp & 0xffffu) + (bias_u & 0xffff0000u); |
| 538 | } |
| 539 | else |
| 540 | { |
| 541 | tmp = (bias_u & 0xffffu); |
| 542 | } |
| 543 | #endif /* HAS_BIAS */ |
| 544 | |
| 545 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp); |
| 546 | } |
| 547 | } |
| 548 | |
| 549 | #endif /* KERNEL_1x1 */ |
| 550 | #else /* DATA_TYPE_FP32 */ |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 551 | #error Data type not supported |
| 552 | #endif /* DATA_TYPE_FP32 */ |
| 553 | #endif /* IM2COL_GENERIC */ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 554 | |
| 555 | #ifdef IM2COL_REDUCED |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 556 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 557 | /** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation |
| 558 | * |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 559 | * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 560 | * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. |
| 561 | * |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 562 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 |
| 563 | * @param[in] src_attrs The attributes of the source tensor |
| 564 | * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr |
| 565 | * @param[in] dst_attrs The attributes of the destination tensor |
| 566 | * @param[in] width The width of the input tensor |
| 567 | * @param[in] height The height of the input tensor |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 568 | */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 569 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 570 | SHADER_PARAMS_DECLARATION |
| 571 | { |
| 572 | Tensor3DAttributes src_attrs; |
| 573 | VectorAttributes dst_attrs; |
| 574 | uint width; |
| 575 | uint height; |
| 576 | }; |
| 577 | |
| 578 | #ifdef DATA_TYPE_FP32 |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 579 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 580 | TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); |
| 581 | TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); |
| 582 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 583 | void main(void) |
| 584 | { |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 585 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 586 | VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 587 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 588 | uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); |
| 589 | uvec3 size = uvec3(gl_WorkGroupSize.xyz); |
| 590 | uint image_size = width * height; |
| 591 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x + pos.y * width + pos.z * image_size); |
| 592 | |
| 593 | STORE(dst_ptr, tmp_out_offset, LOAD_CURRENT_ITEM(src_ptr, src_iter)); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 594 | |
| 595 | #ifdef HAS_BIAS |
| 596 | // If it is the last thread in the 3 dimensional workgroup |
| 597 | if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1)) |
| 598 | { |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 599 | tmp_out_offset += (dst_attrs.stride_x >> uint(2)); |
| 600 | STORE(dst_ptr, tmp_out_offset, 1.f); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 601 | } |
| 602 | #endif // HAS_BIAS |
| 603 | } |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 604 | |
| 605 | #elif defined(DATA_TYPE_FP16) |
| 606 | |
| 607 | #if defined(IM2COL_REDUCED_8X) |
| 608 | TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly); |
| 609 | TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, restrict); |
| 610 | #elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */ |
| 611 | TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly); |
| 612 | TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, restrict); |
| 613 | #else /* IM2COL_REDUCED_8X */ |
| 614 | TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); |
| 615 | TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict); |
| 616 | #endif /* IM2COL_REDUCED_8X */ |
| 617 | |
| 618 | #if defined(IM2COL_REDUCED_GENERIC) |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 619 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 620 | void main(void) |
| 621 | { |
| 622 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 623 | Tensor3DIterator src_nostep_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); |
| 624 | VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift); |
| 625 | |
| 626 | uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); |
| 627 | uvec3 size = uvec3(gl_WorkGroupSize.xyz); |
| 628 | uint image_size = width * height; |
| 629 | uint element_count = src_attrs.step_x / src_attrs.stride_x; |
| 630 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * element_count + pos.y * width + pos.z * image_size); |
| 631 | uint width_fp16 = (width + uint(1)) >> uint(1); |
| 632 | uint tmp; |
| 633 | |
| 634 | // odd width |
| 635 | if(width % uint(2) != uint(0)) |
| 636 | { |
| 637 | // even row |
| 638 | if((pos.y + pos.z * height) % uint(2) == uint(0)) |
| 639 | { |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 640 | // skip last element of each line to avoid write conflict except for last line |
| 641 | if((pos.x < (width / element_count)) || ((pos.y == gl_NumWorkGroups.y - 1u) && (pos.z == gl_NumWorkGroups.z - 1u))) |
| 642 | { |
| 643 | tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 644 | STORE(dst_ptr, tmp_out_offset, tmp); |
| 645 | } |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 646 | } |
| 647 | else |
| 648 | { |
| 649 | // special op |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 650 | uint tmp_left = uint(0); |
| 651 | uint tmp_right = uint(0); |
| 652 | tmp_right = LOAD_CURRENT_ITEM(src_ptr, src_iter); //right half |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 653 | if(pos.x == uint(0)) |
| 654 | { |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 655 | tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); //left half |
| 656 | tmp_right = (tmp_left & uint(0xffff)) + (tmp_right << uint(16)); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 657 | } |
| 658 | else |
| 659 | { |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 660 | tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z))); |
| 661 | tmp_right = ((tmp_left >> uint(16)) + (tmp_right << uint(16))); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 662 | } |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 663 | STORE(dst_ptr, tmp_out_offset, tmp_right); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 664 | } |
| 665 | } |
| 666 | else |
| 667 | { |
| 668 | tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 669 | STORE(dst_ptr, tmp_out_offset, tmp); |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 670 | } |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 671 | |
| 672 | #ifdef HAS_BIAS |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 673 | // If it is the last thread in the 3 dimensional workgroup |
| 674 | if(pos.x == (size.x - 1u) && pos.y == (size.y - 1u) && pos.z == (size.z - 1u)) |
| 675 | { |
| 676 | tmp_out_offset += (dst_attrs.stride_x >> dst_shift); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 677 | |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 678 | // FIXME: need odd/even detection for tmp_out_offset? |
| 679 | mediump vec2 bias_vec = vec2(1.0f, 1.0f); |
| 680 | STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 681 | } |
steli01 | 4df0575 | 2018-01-30 09:49:07 +0800 | [diff] [blame] | 682 | #endif // HAS_BIAS |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 683 | } |
| 684 | |
| 685 | #else /* IM2COL_REDUCED_GENERIC */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 686 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 687 | void main(void) |
| 688 | { |
| 689 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); |
| 690 | VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift); |
| 691 | |
| 692 | uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); |
| 693 | #if defined(IM2COL_REDUCED_8X) |
| 694 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE)); |
| 695 | uvec4 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 696 | STORE(dst_ptr, tmp_out_offset, tmp); |
| 697 | #elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */ |
| 698 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE)); |
| 699 | uvec2 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 700 | STORE(dst_ptr, tmp_out_offset, tmp); |
| 701 | #else /* IM2COL_REDUCED_8X */ |
| 702 | uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE)); |
| 703 | uint tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter); |
| 704 | STORE(dst_ptr, tmp_out_offset, tmp); |
| 705 | #endif /* IM2COL_REDUCED_8X */ |
| 706 | } |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 707 | |
| 708 | #endif /* IM2COL_REDUCED_GENERIC */ |
| 709 | #else /* DATA_TYPE_FP32 */ |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 710 | #error Data type not supported |
| 711 | #endif /* DATA_TYPE_FP32 */ |
| 712 | #endif /* IM2COL_REDUCED */ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 713 | |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 714 | #ifdef COL2IM |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 715 | #ifdef WIDTH_OUTPUT |
| 716 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 717 | /** This kernel performs a reshaping of the output of the convolution layer. |
| 718 | * |
 * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" or "#define DATA_TYPE_FP16"
| 720 | * |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 721 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 |
| 722 | * @param[in] src_attrs The attributes of the source tensor |
| 723 | * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr |
| 724 | * @param[in] dst_attrs The attributes of the destination tensor |
| 725 | * @param[in] dst_depth The length of the destination tensor in Z dimension |
| 726 | * @param[in] dst_strideZ The actual stride of the destination tensor in Z dimension |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 727 | */ |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 728 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 729 | SHADER_PARAMS_DECLARATION |
| 730 | { |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 731 | Tensor3DAttributes src_attrs; |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 732 | Tensor3DAttributes dst_attrs; |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 733 | uint dst_depth; |
| 734 | uint dst_strideZ; |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 735 | }; |
| 736 | |
| 737 | #ifdef DATA_TYPE_FP32 |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 738 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 739 | TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); |
| 740 | TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 741 | |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 742 | void main(void) |
| 743 | { |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 744 | Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 745 | Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 746 | |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 747 | uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 748 | TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 749 | |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 750 | STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter)); |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 751 | } |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 752 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 753 | #elif defined(DATA_TYPE_FP16) |
| 754 | |
Stephen Li | e855c23 | 2018-01-04 14:13:22 +0800 | [diff] [blame] | 755 | TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); |
| 756 | TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict); |
| 757 | |
/* FP16 COL2IM: buffers are addressed as uint words, each packing two fp16
 * values. Every invocation produces one packed output word, i.e. two output
 * channels' worth of data, by gathering the matching halves of two source
 * words. The parity of the output channel (pos.z % dst_depth) selects whether
 * the low (.x) or high (.y) half of each unpacked pair is used. */
void main(void)
{
    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

    uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);

    if((pos.z % dst_depth) % 2u == 0u)
    {
        // Even output channel: the wanted halves sit in the low 16 bits of the
        // two consecutive source rows (pos.x is doubled since each word spans
        // two x positions).
        uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ;
        uint tmp1_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
        uint tmp2_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
        vec2 tmp1                   = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
        vec2 tmp2                   = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
        vec2 result                 = vec2(tmp1.x, tmp2.x);
        STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
    }
    else
    {
        // Odd output channel: step back 2 bytes (one fp16) so the wanted
        // halves land in the high 16 bits of the loaded words.
        uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ - 2u;
        uint tmp1_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
        uint tmp2_in_offset         = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
        vec2 tmp1                   = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
        vec2 tmp2                   = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
        vec2 result                 = vec2(tmp1.y, tmp2.y);
        STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
    }
}
| 786 | |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 787 | #else /* DATA_TYPE_FP32 */ |
Anthony Barbier | 7068f99 | 2017-10-26 15:23:08 +0100 | [diff] [blame] | 788 | #error Data type not supported |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 789 | #endif /* DATA_TYPE_FP32 */ |
Michele Di Giorgio | fc1d1e2 | 2018-04-10 14:24:35 +0100 | [diff] [blame] | 790 | #endif /* WIDTH_OUTPUT */ |
zhenglin | 57b2010 | 2018-01-05 14:39:50 +0800 | [diff] [blame] | 791 | #endif /* COL2IM */ |