Blame - src/core/CL/cl_kernels/softmax_layer.cl - ml/ComputeLibrary

widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));

387

max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);

388

#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

389

390

// Perform max reduction

391

#if VECTOR_SIZE == 16

392

max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);

393

#endif /* VECTOR SIZE 16 END */

394

#if VECTOR_SIZE >= 8

395

max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);

396

#endif /* VECTOR SIZE 8 END */

397

#if VECTOR_SIZE >= 4

398

max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);

399

#endif /* VECTOR SIZE 4 END */

400

max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);

401

// Store result

402

*((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;

/* Second section */

// Load max value of 1D logits vector (row)

407

DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));

408

409

// Set sum vector

410

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

411

sum1D = 0;

412

413

// Shift values, exp and sum

414

for(uint i = 0; i < width_; i++)

415

{

416

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

417

data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));

418

data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);

419

#ifdef BETA

420

data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);

421

#endif /* BETA */

422

data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);

423

VSTORE(VECTOR_SIZE)

424

(data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));

425

sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);

426

}

427

428

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE

429

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

430

data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));

431

data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);

432

#ifdef BETA

433

data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);

434

#endif /* BETA */

435

data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);

436

widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));

437

data = select(0, data, widx);

438

VSTORE(VECTOR_SIZE)

439

(data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));

440

sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);

441

#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

442

443

// Perform sum reduction

444

#if VECTOR_SIZE == 16

445

sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);

446

#endif /* VECTOR SIZE 16 END */

447

#if VECTOR_SIZE >= 8

448

sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);

449

#endif /* VECTOR SIZE 8 END */

450

#if VECTOR_SIZE >= 4

451

sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);

452

#endif /* VECTOR SIZE 4 END */

453

sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);

454

455

// Calculate and store result

456

*((__global DATA_TYPE *)sum.ptr) = sum1D.s0;

457

}

458

459

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponential of each element and sums all elements across each row.
 *
 * The work-items of a work-group cooperate on one row: every work-item accumulates a private 4-lane partial
 * result over a strided subset of the row, the partials are exchanged through local memory and combined with a
 * barrier-synchronised tree reduction. The row maximum is kept in local memory (max_local); the row sum is written
 * to @p sum_ptr by the work-item with local id 0.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 *       This kernel always works on 4-element vectors, so here the flag enables the masked handling of a trailing
 *       partial vector (only evaluated when -DNON_MULTIPLE_OF_GRID_SIZE is also passed).
 * @note -DNON_MULTIPLE_OF_GRID_SIZE must be passed to enable the handling of the elements left over after the main
 *       strided loop (width not a multiple of GRID_SIZE * 4).
 * @note -DGRID_SIZE=n must be passed. It sizes the local scratch buffer and selects the reduction stages. The final
 *       reduction step reads tmp_local[1], so GRID_SIZE must be at least 2; the halving tree only covers every
 *       entry when GRID_SIZE is a power of two. NOTE(review): presumably GRID_SIZE equals the work-group size along
 *       dimension 0 - confirm against the host-side kernel configuration.
 * @note -DMINVAL=value (initial value of the running maximum) and -DSELECT_DATA_TYPE=type (integer type used for the
 *       lane mask passed to select()) must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr.
 *                                                NOTE(review): the kernel body never reads or writes this tensor; the maximum is only kept in local memory.
 * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 * @param[in]  width                              Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    // NOTE(review): maxo is not referenced again in this kernel; kept so the argument handling is unchanged.
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint lid = get_local_id(0);

#ifdef BETA
    // Initialize beta
    VEC_DATA_TYPE(DATA_TYPE, 4)
    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
#endif /* BETA */

    // Define one temporary vector per work-item.
    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
    // Row maximum, written by work-item 0 and read by every work-item after a barrier.
    __local DATA_TYPE max_local;

    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;

    /* First section: maximum of the row */

    // Each iteration advances by GRID_SIZE * 4 in x, i.e. past the vectors handled by the other work-items.
    uint i = 0;
    for(; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        // One more full vector for the first boundary_workitems work-items.
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        // No extra full vector was processed: the partial vector follows the last vector of the main loop,
        // which belongs to the last work-item of the previous iteration.
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of 4: load the vector right after the last full one handled by this work-item.
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        // Lane mask: true for lanes whose index within the row is below width.
        // NOTE(review): idx4 is defined outside this kernel; presumably the lane indices (0, 1, 2, 3) - confirm.
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        // Out-of-range lanes are replaced by the 4-lane minimum so that they cannot affect the maximum.
        // type_min4 keeps every select() operand 4 lanes wide, whatever VECTOR_SIZE is.
        max_val_vec = MAX_OP(max_val_vec, select(type_min4, data_max, widx), DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = max_val_vec;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction of the per-work-item maxima. GRID_SIZE is a compile-time constant, so every work-item
    // takes the same outer branches and reaches the same barriers.
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        // Combine the last two partial vectors, then reduce the 4 lanes to a scalar.
        max_val_vec     = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
        max_local       = max_val_vec.s0;
    }
    // Makes max_local visible to all work-items and guarantees that work-item 0 has finished reading
    // tmp_local before the buffer is reused for the sum below.
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section: shift by the maximum, exponentiate, store and sum */

    // Set sum vector
    VEC_DATA_TYPE(DATA_TYPE, 4)
    sum1D = 0;
    DATA_TYPE max_val = max_local;

    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // Recomputed on purpose: the first section may have overwritten boundary_workitems with GRID_SIZE.
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of vector size: (GRID_SIZE * i * 4) + 4 addresses the vector that follows the
        // last full vector handled by this work-item (4 positions ahead).
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        // Zero the lanes beyond the end of the row so that they do not contribute to the sum.
        data = select(0, data, widx);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction of the per-work-item partial sums (same scheme as the max reduction above).
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
        // Perform sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
        sum1D.s0  = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
        // Store the row sum
        *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
    }
}