Blame - src/core/CL/cl_kernels/gemmlowp.cl - ml/ComputeLibrary

2017-11-22 20:42:13 +0700

[diff] [blame]

383

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

384

*

385

* The final result is:

386

*

387

* mm_result[i][k] = mm_result[i][k] +

388

* (sum_col[k] * A_OFFSET) +

389

* (sum_row[i] * B_OFFSET) +

390

* (K_OFFSET)

391

*

392

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

393

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

394

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

395

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

396

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

397

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

398

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

399

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

400

* @param[in] sum_col_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

401

* @param[in] sum_col_stride_x Stride of the source tensor in X dimension (in bytes)

402

* @param[in] sum_col_step_x sum_col_stride_x * number of elements along X processed per workitem(in bytes)

403

* @param[in] sum_col_stride_y Stride of the source tensor in Y dimension (in bytes)

404

* @param[in] sum_col_step_y sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

405

* @param[in] sum_col_offset_first_element_in_bytes The offset of the first element in the source tensor

406

* @param[in] sum_row_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

407

* @param[in] sum_row_stride_x Stride of the source tensor in X dimension (in bytes)

408

* @param[in] sum_row_step_x sum_row_stride_x * number of elements along X processed per workitem(in bytes)

409

* @param[in] sum_row_stride_y Stride of the source tensor in Y dimension (in bytes)

410

* @param[in] sum_row_step_y sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

411

* @param[in] sum_row_offset_first_element_in_bytes The offset of the first element in the source tensor

412

*/

413

__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    // Locate the 16 consecutive int32 accumulators updated in place by this work-item
    Tensor3D      acc     = CONVERT_TO_TENSOR3D_STRUCT(mm_result);
    __global int *acc_ptr = (__global int *)acc.ptr;

    // The correction always contains the constant term; the A_OFFSET / B_OFFSET
    // contributions are added on top only when the matching macro is defined
    int16 offset_term = (int16)K_OFFSET;

#if defined(A_OFFSET)
    Image col_sums = CONVERT_TO_IMAGE_STRUCT(sum_col);

#if defined(SUM_COL_HAS_BATCHES)
    // Batched column sums: advance by get_global_id(2) rows of sum_col
    __global int *col_sums_ptr = (__global int *)(col_sums.ptr + get_global_id(2) * sum_col_stride_y);
#else  // defined(SUM_COL_HAS_BATCHES)
    // Un-batched column sums: every batch reads the same row
    __global int *col_sums_ptr = (__global int *)(col_sums.ptr);
#endif // defined(SUM_COL_HAS_BATCHES)

    // sum_col[k] * A_OFFSET for the 16 columns handled here
    offset_term += vload16(0, col_sums_ptr) * (int16)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image row_sums = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // One scalar row sum, indexed by get_global_id(1) within the row selected by
    // get_global_id(2), is broadcast to all 16 lanes: sum_row[i] * B_OFFSET
    __global int *row_sums_ptr = (__global int *)(row_sums.ptr + get_global_id(2) * sum_row_stride_y);
    offset_term += (int16)row_sums_ptr[get_global_id(1)] * (int16)B_OFFSET;
#endif // defined(B_OFFSET)

    // Read the GEMM accumulators, apply the offset contribution and write them back
    const int16 corrected = vload16(0, acc_ptr) + offset_term;
    vstore16(corrected, 0, acc_ptr);
}

460

#endif // defined(K_OFFSET)

461

462

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

463

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

464

*

465

* This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.

466

* The following computations will be performed by the kernel:

467

*

468

* -# Add offset terms to final result

469

* -# Add bias to final result (if -DADD_BIAS is passed at compile time)

470

* -# Multiply each entry of result by result_mult_int

471

* -# Shift the int32 accumulator by result_shift

472

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

473

* -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)

474

*

475

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT

476

*

477

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

478

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

479

* These values can be used to implement "rectified linear unit" activation functions

480

*

481

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

482

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

483

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

484

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

485

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

486

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

487

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

488

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

489

* @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr

490

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

491

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

492

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

493

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

494

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

495

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

496

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

497

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

498

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

499

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

500

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

501

*/

502

__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                  TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item addresses of the input accumulators, the output and (optionally) the biases
    Tensor3D acc_tensor = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D out_tensor = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector bias_vector = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    // Load 16 int32 accumulators and add the result offset
    int16 acc = vload16(0, (__global int *)acc_tensor.ptr) + (int16)RESULT_OFFSET;

#if defined(ADD_BIAS)
    // The bias is added before the scaling below, so it is scaled together with the accumulator
    acc += vload16(0, (__global int *)bias_vector.ptr);
#endif // defined(ADD_BIAS)

    // Scale by the integer multiplier, then shift right by RESULT_SHIFT bits
    acc = acc * (RESULT_MULT_INT);
    acc = acc >> (RESULT_SHIFT);

    // Saturate to [0..255] while narrowing to 8 bit
    uchar16 quantized = convert_uchar16_sat(acc);

    // Optional activation-style bounds, applied on the already saturated values
#if defined(MIN_BOUND)
    quantized = max(quantized, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    quantized = min(quantized, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Write the 16 quantized values to the destination
    vstore16(quantized, 0, out_tensor.ptr);
}

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

543

#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

544

545

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)

546

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

547

*

548

* This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.

549

* The following computations will be performed by the kernel:

550

*

551

* -# Add bias to each entry of input (if -DADD_BIAS is passed at compile time)

552

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

553

* -# Round to nearest division by a power-of-two using result_shift

554

* -# Add offset to each result

555

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

556

* -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)

557

*

558

* @attention The offset, fixed-point multiplier and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT

559

*

560

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

561

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

562

* These values can be used to implement "rectified linear unit" activation functions

563

*

564

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

565

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

566

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

567

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

568

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

569

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

570

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

571

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

572

* @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr

573

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

574

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

575

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

576

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

577

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

578

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

579

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

580

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

581

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

582

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

583

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

584

*/

585

__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item addresses of the input accumulators, the output and (optionally) the biases
    Tensor3D acc_tensor = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D out_tensor = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector bias_vector = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    // Load 16 int32 accumulators
    int16 acc = vload16(0, (__global int *)acc_tensor.ptr);

#if defined(ADD_BIAS)
    // The bias is added before requantization
    acc += vload16(0, (__global int *)bias_vector.ptr);
#endif // defined(ADD_BIAS)

    // Requantize with the fixed-point multiplier and RESULT_SHIFT.
    // NOTE(review): the macro is defined outside this view; presumably it performs the
    // fixed-point multiplication followed by a rounding right shift - confirm against its definition
    acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);

    // The output offset is applied after the requantization
    acc += (int16)RESULT_OFFSET_AFTER_SHIFT;

    // Saturate to [0..255] while narrowing to 8 bit
    uchar16 quantized = convert_uchar16_sat(acc);

    // Optional activation-style bounds, applied on the already saturated values
#if defined(MIN_BOUND)
    quantized = max(quantized, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    quantized = min(quantized, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Write the 16 quantized values to the destination
    vstore16(quantized, 0, out_tensor.ptr);
}

Chunosov