/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

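/* Compile-time configuration (illustrative summary; the exact options are chosen by the host
 * when this program is built):
 *   -DDATA_TYPE=<type>        element type, e.g. float or half
 *   -DUSE_F16                 selects half-precision limits and the select() mask type
 *   -DVECTOR_SIZE=<2|4|8|16>  vector width used by the serial kernel (defaults to 16 below)
 *   -DLOG_VECTOR_SIZE=<n>     log2 of VECTOR_SIZE, used for the per-row loop bounds
 *   -DBETA=<value>            optional softmax scaling factor (defaults to 1.0)
 *   -DGRID_SIZE=<n>           number of work-items cooperating on one row (parallel kernel)
 *   -DNON_MULTIPLE_OF_VECTOR_SIZE / -DNON_MULTIPLE_OF_GRID_SIZE
 *                             enable the leftover-element handling paths
 */
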
#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
#define MUL_OP(x, y, type, size) ((x) * (y))
#define DIV_OP(x, y, type, size) ((x) / (y))
#define EXP_OP(x, type, size) exp((x))

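/* select() requires an integer mask whose elements match the width of DATA_TYPE,
 * hence 16-bit short for half (USE_F16) and 32-bit int for float. */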
#ifdef USE_F16
#define MINVAL -HALF_MAX
#define SELECT_DATA_TYPE short
#else /* USE_F16 */
#define MINVAL -FLT_MAX
#define SELECT_DATA_TYPE int
#endif /* USE_F16 */

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

/* Vector size, i.e. number of vector elements. */
#if VECTOR_SIZE == 2
__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
__constant uint2 idx__ = (uint2)(0, 1);

#elif VECTOR_SIZE == 4
__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);

#elif VECTOR_SIZE == 8
__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

#endif /* VECTOR_SIZE END */

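/* Note: LOG_VECTOR_SIZE is only defined here for the default VECTOR_SIZE of 16; for
 * VECTOR_SIZE 2, 4 or 8 it is expected to be passed as a build option alongside it. */
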
// TODO (COMPMID-661): Remove if the non-fused kernels are removed
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__constant uint4 idx4 = (uint4)(0, 1, 2, 3);

/** Divides all the values of the input tensor by the sum calculated from the softmax_layer_shift_exp_sum kernel.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

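    // Final normalization step: src already holds exp(x - max) produced by the shift/exp/sum
    // kernel, so dividing by the per-row sum yields the softmax probabilities.
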
    // Load sum value of 1D logits vector (row)
    DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
    VEC_DATA_TYPE(DATA_TYPE, 16)
    data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
    vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponent of each element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[in] width Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

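    // One work-item per row: a first pass finds the row maximum, a second pass stores
    // exp(x - max) (optionally scaled by beta) to dst and accumulates the row sum.
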
#ifdef BETA
    // Initialize beta
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
#endif /* BETA */

    // Initialize local maximum
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;

    // Calculate max of row
    const uint width_ = width >> LOG_VECTOR_SIZE;
    for(uint i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
        data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
    VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
    max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform max reduction
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
    // Store result
    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;

    /* Second section */

    // Load max value of 1D logits vector (row)
    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));

    // Set sum vector
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    sum1D = 0;

    // Shift values, exp and sum
    for(uint i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
        data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
    data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
#ifdef BETA
    data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
#endif /* BETA */
    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
    data = select(0, data, widx);
    VSTORE(VECTOR_SIZE)
    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
    sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);

    // Calculate and store result
    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponent of each element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[in] width Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

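    // GRID_SIZE work-items cooperate on a single row: each reduces an interleaved subset of
    // the row and the partial results are combined through local memory further down.
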
    const uint lid = get_local_id(0);

#ifdef BETA
    // Initialize beta
    VEC_DATA_TYPE(DATA_TYPE, 4)
    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
#endif /* BETA */

    // Define one temporary vector per work-item.
    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;
    // Calculate max of row
    uint i = 0;
    for(; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of 4
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = max_val_vec;

    barrier(CLK_LOCAL_MEM_FENCE);

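    // Tree reduction of the per-work-item maxima held in local memory: the number of active
    // work-items halves at every step (GRID_SIZE is assumed to be a power of two).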
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
        max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
        max_local = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    VEC_DATA_TYPE(DATA_TYPE, 4)
    sum1D = 0;
    DATA_TYPE max_val = max_local;

    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of vector size: the offset (GRID_SIZE * i * 4) + 4 moves 4 elements ahead (the * 4 accounts for the vector stride)
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        data = select(0, data, widx);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

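    // Same local-memory tree reduction as above, this time accumulating the partial sums.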
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
        // Perform sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
        sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
        *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
    }
}