/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
#define MUL_OP(x, y, type, size) ((x) * (y))
#define DIV_OP(x, y, type, size) ((x) / (y))
#define EXP_OP(x, type, size) exp((x))

#ifdef USE_F16
#define MINVAL -HALF_MAX
#define SELECT_DATA_TYPE short
#else /* USE_F16 */
#define MINVAL -FLT_MAX
#define SELECT_DATA_TYPE int
#endif /* USE_F16 */

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

/* Vector size, i.e. number of vector elements. */
#if VECTOR_SIZE == 2
__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
__constant uint2 idx__ = (uint2)(0, 1);

#elif VECTOR_SIZE == 4
__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);

#elif VECTOR_SIZE == 8
__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

#endif /* VECTOR_SIZE END */

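/* Note on configuration (illustrative, not an exhaustive list of build options): the kernels
 * below step through each row with "i << LOG_VECTOR_SIZE", so when the host selects
 * -DVECTOR_SIZE=2, 4 or 8 it is expected to also pass a matching -DLOG_VECTOR_SIZE
 * (1, 2 or 3 respectively); only the default 16/4 pair is defined in this file.
 * A hypothetical option string for an F32 build could look like:
 *   -DDATA_TYPE=float -DVECTOR_SIZE=8 -DLOG_VECTOR_SIZE=3 -DBETA=1.0f
 */
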
// TODO (COMPMID-661): Remove if the non-fused kernels are removed
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__constant uint4 idx4 = (uint4)(0, 1, 2, 3);

/** Divides all the values of the input tensor by the sum calculated by the softmax_layer_max_shift_exp_sum_* kernels.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum value of the 1D logits vector (row)
    DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
    VEC_DATA_TYPE(DATA_TYPE, 16)
    data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
#ifdef LOG_SOFTMAX
    vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
#else /* LOG_SOFTMAX */
    vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
#endif /* LOG_SOFTMAX */
}
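
/* A minimal scalar sketch of the normalization step performed by softmax_layer_norm, for
 * reference only. This helper is hypothetical: it is not called by any kernel in this file and
 * is guarded by SOFTMAX_LAYER_REFERENCE_EXAMPLE (a made-up define) so it is never built by
 * default. It assumes `row` holds the `width` values written by the max_shift_exp_sum kernels
 * for one row and `sum_val` is the corresponding value from the sum tensor.
 */
#ifdef SOFTMAX_LAYER_REFERENCE_EXAMPLE
void softmax_norm_row_reference(__global DATA_TYPE *row, uint width, DATA_TYPE sum_val)
{
    for(uint x = 0; x < width; ++x)
    {
#ifdef LOG_SOFTMAX
        // Log-softmax path: mirrors SUB_OP in the kernel above
        row[x] = row[x] - sum_val;
#else  /* LOG_SOFTMAX */
        // Softmax path: mirrors DIV_OP in the kernel above
        row[x] = row[x] / sum_val;
#endif /* LOG_SOFTMAX */
    }
}
#endif /* SOFTMAX_LAYER_REFERENCE_EXAMPLE */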

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponential of each shifted element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[in] width Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

#ifdef BETA
    // Initialize beta
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
#endif /* BETA */

    // Initialize local maximum
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;

    // Calculate max of row
    const uint width_ = width >> LOG_VECTOR_SIZE;
    for(uint i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
        data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
    VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
    max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform max reduction
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
    // Store result
    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;

    /* Second section */

    // Load max value of 1D logits vector (row)
    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));

    // Set sum vector
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    sum1D = 0;

    // Shift values, exp and sum
    for(uint i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
        data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
#else /* LOG_SOFTMAX */
        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
#endif /* LOG_SOFTMAX */
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
    data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
#ifdef BETA
    data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
#endif /* BETA */
#ifdef LOG_SOFTMAX
    VSTORE(VECTOR_SIZE)
    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
    data = select(0, data, widx);
#else /* LOG_SOFTMAX */
    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
    data = select(0, data, widx);
    VSTORE(VECTOR_SIZE)
    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
#endif /* LOG_SOFTMAX */
    sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);

    // Calculate and store result
    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
}
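
/* Worked example (illustrative numbers only): for a row x = {1, 2, 3} with BETA left undefined,
 * the kernel above stores max = 3 in maxo, writes dst = exp(x - 3) ~= {0.135, 0.368, 1.0} and
 * stores sum ~= 1.503; softmax_layer_norm then divides each dst value by that sum.
 * With -DLOG_SOFTMAX the shifted values {-2, -1, 0} are written to dst instead, while the
 * exponentials are still accumulated into the same sum.
 */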

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponential of each shifted element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[in] width Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint lid = get_local_id(0);

#ifdef BETA
    // Initialize beta
    VEC_DATA_TYPE(DATA_TYPE, 4)
    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
#endif /* BETA */

    // Define one temporary vector per work-item.
    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;
    // Calculate max of row
    uint i = 0;
    for(; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of 4
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = max_val_vec;

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
        max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
        max_local = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    VEC_DATA_TYPE(DATA_TYPE, 4)
    sum1D = 0;
    DATA_TYPE max_val = max_local;

    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
        data = EXP_OP(data, DATA_TYPE, 4);
#else /* LOG_SOFTMAX */
        data = EXP_OP(data, DATA_TYPE, 4);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
#endif /* LOG_SOFTMAX */
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
        data = EXP_OP(data, DATA_TYPE, 4);
#else /* LOG_SOFTMAX */
        data = EXP_OP(data, DATA_TYPE, 4);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
#endif /* LOG_SOFTMAX */
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
        data = EXP_OP(data, DATA_TYPE, 4);
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        data = select(0, data, widx);
#else /* LOG_SOFTMAX */
        data = EXP_OP(data, DATA_TYPE, 4);
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        data = select(0, data, widx);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
#endif /* LOG_SOFTMAX */
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
        // Perform sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
        sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
        *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
    }
}
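
/* The unrolled local-memory reductions above implement a standard tree reduction across the
 * GRID_SIZE work-items that cooperate on one row. A compact, hypothetical equivalent is
 * sketched below for reference only; it is not used by the kernels and is guarded by
 * SOFTMAX_LAYER_REFERENCE_EXAMPLE (a made-up define) so it is never built by default.
 */
#ifdef SOFTMAX_LAYER_REFERENCE_EXAMPLE
VEC_DATA_TYPE(DATA_TYPE, 4)
tree_reduce_add_reference(__local VEC_DATA_TYPE(DATA_TYPE, 4) * tmp, uint lid)
{
    // Halve the number of active work-items each step, accumulating pairs of partial sums
    for(uint stride = GRID_SIZE / 2; stride > 0; stride >>= 1)
    {
        if(lid < stride)
        {
            tmp[lid] = ADD_OP(tmp[lid + stride], tmp[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    return tmp[0];
}
#endif /* SOFTMAX_LAYER_REFERENCE_EXAMPLE */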