Blame - src/core/CL/cl_kernels/softmax_layer.cl - ml/ComputeLibrary

widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));

199

max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);

200

#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

201

202

// Perform max reduction

203

#if VECTOR_SIZE == 16

204

max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);

205

#endif /* VECTOR SIZE 16 END */

206

#if VECTOR_SIZE >= 8

207

max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);

208

#endif /* VECTOR SIZE 8 END */

209

#if VECTOR_SIZE >= 4

210

max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);

211

#endif /* VECTOR SIZE 4 END */

212

max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);

213

// Store result

214

*((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;

/* Second section */

// Load max value of 1D logits vector (row)

219

DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));

220

221

// Set sum vector

222

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

223

sum1D = 0;

224

225

// Shift values, exp and sum

226

for(uint i = 0; i < width_; i++)

227

{

228

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

229

data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));

230

data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);

231

#ifdef BETA

232

data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);

233

#endif /* BETA */

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

234

#ifdef LOG_SOFTMAX

235

VSTORE(VECTOR_SIZE)

236

(data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));

237

data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);

238

#else /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

239

data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);

240

VSTORE(VECTOR_SIZE)

241

(data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

242

#endif /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

243

sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);

244

}

245

246

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE

247

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

248

data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));

249

data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);

250

#ifdef BETA

251

data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);

252

#endif /* BETA */

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

253

#ifdef LOG_SOFTMAX

254

VSTORE(VECTOR_SIZE)

255

(data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));

256

data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);

257

widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));

258

data = select(0, data, widx);

259

#else /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

260

data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);

261

widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));

262

data = select(0, data, widx);

263

VSTORE(VECTOR_SIZE)

264

(data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

265

#endif /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

266

sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);

267

#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

268

269

// Perform sum reduction

270

#if VECTOR_SIZE == 16

271

sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);

272

#endif /* VECTOR SIZE 16 END */

273

#if VECTOR_SIZE >= 8

274

sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);

275

#endif /* VECTOR SIZE 8 END */

276

#if VECTOR_SIZE >= 4

277

sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);

278

#endif /* VECTOR SIZE 4 END */

279

sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);

280

281

// Calculate and store result

282

*((__global DATA_TYPE *)sum.ptr) = sum1D.s0;

283

}

284

285

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,

286

* then gets the exponent of each element as sums all elements across each row.

287

*

288

* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

289

* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.

290

* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).

291

*

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

292

* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

293

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

294

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

295

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

296

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

297

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

298

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

299

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

300

* @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr

301

* @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)

302

* @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)

303

* @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)

304

* @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)

305

* @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)

306

* @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)

307

* @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor

308

* @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr

309

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

310

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

311

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

312

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

313

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

314

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

315

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

316

* @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr

317

* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)

318

* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)

319

* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)

320

* @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)

321

* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)

322

* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)

323

* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor

324

* @param[in] width Input image width

325

*/

326

__kernel void softmax_layer_max_shift_exp_sum_parallel(

327

TENSOR3D_DECLARATION(src),

328

TENSOR3D_DECLARATION(maxo),

329

TENSOR3D_DECLARATION(dst),

330

TENSOR3D_DECLARATION(sum),

331

uint width)

332

{

333

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

334

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

335

Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);

336

Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

337

338

const uint lid = get_local_id(0);

#ifdef BETA

// Initialize beta

VEC_DATA_TYPE(DATA_TYPE, 4)

343

beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;

344

#endif /* BETA */

345

346

// Define one temporary vector per work-item.

347

__local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];

348

__local DATA_TYPE max_local;

349

350

__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);

351

VEC_DATA_TYPE(DATA_TYPE, 4)

352

max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;

353

// Number of elements per work-item.

354

const uint row = width / GRID_SIZE;

355

// Number of iterations per work-item.

356

const uint width_ = row >> 2;

357

// Calculate max of row

358

uint i = 0;

359

for(; i < width_; i++)

360

{

361

VEC_DATA_TYPE(DATA_TYPE, 4)

362

data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));

363

max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);

364

}

365

#ifdef NON_MULTIPLE_OF_GRID_SIZE

366

// How many work-items needed to complete the computation.

367

//TODO: Optimize this calculation (avoid %).

368

int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;

369

if(lid < boundary_workitems)

370

{

371

VEC_DATA_TYPE(DATA_TYPE, 4)

372

data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));

373

max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);

374

}

375

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE

376

if(boundary_workitems == 0)

377

{

378

boundary_workitems = GRID_SIZE;

379

i--;

380

}

381

if(lid == (boundary_workitems - 1))

382

{

383

// Handle non multiple of 4

384

VEC_DATA_TYPE(DATA_TYPE, 4)

385

data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));

386

VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)

387

widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));

388

max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);

389

}

390

#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

391

#endif /* NON_MULTIPLE_OF_GRID_SIZE */

392

tmp_local[lid] = max_val_vec;

393

394

barrier(CLK_LOCAL_MEM_FENCE);

if(GRID_SIZE >= 256)

{

if(lid < 128)

{

tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);

401

}

402

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 128)

{

if(lid < 64)

{

tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);

409

}

410

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 64)

{

if(lid < 32)

{

tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);

417

}

418

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 32)

{

if(lid < 16)

{

tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);

425

}

426

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 16)

{

if(lid < 8)

{

tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);

433

}

434

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 8)

{

if(lid < 4)

{

tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);

441

}

442

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 4)

{

if(lid < 2)

{

tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);

449

}

450

barrier(CLK_LOCAL_MEM_FENCE);

}

if(lid == 0)

{

max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);

455

max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);

456

max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);

457

max_local = max_val_vec.s0;

458

}

459

barrier(CLK_LOCAL_MEM_FENCE);

/* Second section */

// Set sum vector

VEC_DATA_TYPE(DATA_TYPE, 4)

465

sum1D = 0;

466

DATA_TYPE max_val = max_local;

467

468

// Shift values, exp and sum

469

for(i = 0; i < width_; i++)

470

{

471

VEC_DATA_TYPE(DATA_TYPE, 4)

472

data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));

473

data = SUB_OP(data, max_val, DATA_TYPE, 4);

474

#ifdef BETA

475

data = MUL_OP(data, beta, DATA_TYPE, 4);

476

#endif /* BETA */

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

477

#ifdef LOG_SOFTMAX

478

VSTORE(4)

479

(data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));

480

data = EXP_OP(data, DATA_TYPE, 4);

481

#else /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

482

data = EXP_OP(data, DATA_TYPE, 4);

483

VSTORE(4)

484

(data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

485

#endif /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

486

sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);

487

}

488

#ifdef NON_MULTIPLE_OF_GRID_SIZE

489

//TODO: Optimize the calculation (avoid %).

490

boundary_workitems = (width % (GRID_SIZE * 4)) / 4;

491

if(lid < boundary_workitems)

492

{

493

VEC_DATA_TYPE(DATA_TYPE, 4)

494

data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));

495

data = SUB_OP(data, max_val, DATA_TYPE, 4);

496

#ifdef BETA

497

data = MUL_OP(data, beta, DATA_TYPE, 4);

498

#endif /* BETA */

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

499

#ifdef LOG_SOFTMAX

500

VSTORE(4)

501

(data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));

502

data = EXP_OP(data, DATA_TYPE, 4);

503

#else /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

504

data = EXP_OP(data, DATA_TYPE, 4);

505

VSTORE(4)

506

(data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

507

#endif /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

508

sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);

509

}

510

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE

511

if(boundary_workitems == 0)

512

{

513

boundary_workitems = GRID_SIZE;

514

i--;

515

}

516

if(lid == (boundary_workitems - 1))

517

{

518

// Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride

519

VEC_DATA_TYPE(DATA_TYPE, 4)

520

data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));

521

data = SUB_OP(data, max_val, DATA_TYPE, 4);

522

#ifdef BETA

523

data = MUL_OP(data, beta, DATA_TYPE, 4);

524

#endif /* BETA */

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

525

#ifdef LOG_SOFTMAX

526

VSTORE(4)

527

(data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));

528

data = EXP_OP(data, DATA_TYPE, 4);

529

VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)

530

widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));

531

data = select(0, data, widx);

532

#else /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

533

data = EXP_OP(data, DATA_TYPE, 4);

534

VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)

535

widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));

536

data = select(0, data, widx);

537

VSTORE(4)

538

(data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));

Sang-Hoon Park

62eeb53

2019-10-29 13:13:19 +0000

[diff] [blame]

539

#endif /* LOG_SOFTMAX */

Chunosov

d6afedc

2017-11-06 22:09:45 +0700

[diff] [blame]

540

sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);

541

}

542

#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

543

#endif /* NON_MULTIPLE_OF_GRID_SIZE */

544

tmp_local[lid] = sum1D;

545

546

barrier(CLK_LOCAL_MEM_FENCE);

if(GRID_SIZE >= 256)

{

if(lid < 128)

{

tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);

553

}

554

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 128)

{

if(lid < 64)

{

tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);

561

}

562

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 64)

{

if(lid < 32)

{

tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);

569

}

570

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 32)

{

if(lid < 16)

{

tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);

577

}

578

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 16)

{

if(lid < 8)

{

tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);

585

}

586

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 8)

{

if(lid < 4)

{

tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);

593

}

594

barrier(CLK_LOCAL_MEM_FENCE);

}

if(GRID_SIZE >= 4)

{

if(lid < 2)

{

tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);

601

}

602

barrier(CLK_LOCAL_MEM_FENCE);

}

if(lid == 0)

{

sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);

607

// Perform max reduction

608

sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);

609

sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);

610

*((__global DATA_TYPE *)sum.ptr) = sum1D.s0;

611

}

612

}