Blame - src/core/GLES_COMPUTE/cs_shaders/gemm.cs - ml/ComputeLibrary

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication

183

*

184

* @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA

185

*

186

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

187

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

188

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

189

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

190

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

191

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

192

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

193

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

194

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

195

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

196

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

197

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

198

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

199

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

200

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

201

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

202

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

203

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

*/

void main()

{

Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);

208

Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);

209

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

210

211

/* Compute address for matrix A and B */

212

src0.current_offset = (src0.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.y) * uint(src0.stride_y))) >> uint(2);

213

src1.current_offset = (src1.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.x) * uint(src1.stride_y))) >> uint(2);

214

215

/* Compute end row address for matrix B */

216

int end_row_mtx_b = int(src1.current_offset) + int(COLS_B);

217

218

/* Reset accumulators */

219

vec4 c00 = vec4(0.0f);

220

vec4 c10 = vec4(0.0f);

221

vec4 c20 = vec4(0.0f);

222

vec4 c30 = vec4(0.0f);

223

224

// FIXME: loop unrolling really needed for GLES?

225

for(; int(src1.current_offset) <= (end_row_mtx_b - 8); src0.current_offset += uint(8), src1.current_offset += uint(8))

226

{

227

/* Load values from matrix A (interleaved) and matrix B (transposed) */

228

vec4 a0;

229

vec4 b0;

230

LOAD16(a0, src0, src0.current_offset);

231

LOAD16(b0, src1, src1.current_offset);

232

233

c00 += vec4(a0.x) * b0;

234

c10 += vec4(a0.y) * b0;

235

c20 += vec4(a0.z) * b0;

236

c30 += vec4(a0.w) * b0;

237

238

/* Load values from matrix A (interleaved) and matrix B (transposed) */

239

LOAD16(a0, src0, src0.current_offset + uint(4));

240

LOAD16(b0, src1, src1.current_offset + uint(4));

241

242

c00 += vec4(a0.x) * b0;

243

c10 += vec4(a0.y) * b0;

244

c20 += vec4(a0.z) * b0;

245

c30 += vec4(a0.w) * b0;

246

}

247

248

for(; int(src1.current_offset) < end_row_mtx_b; src0.current_offset += uint(4), src1.current_offset += uint(4))

249

{

250

/* Load values from matrix A (interleaved) and matrix B (transposed) */

251

vec4 a0;

252

vec4 b0;

253

LOAD16(a0, src0, src0.current_offset);

254

LOAD16(b0, src1, src1.current_offset);

255

256

c00 += vec4(a0.x) * b0;

257

c10 += vec4(a0.y) * b0;

258

c20 += vec4(a0.z) * b0;

259

c30 += vec4(a0.w) * b0;

260

}

261

262

/* Multiply by the weight of matrix product */

263

c00 = c00 * vec4(ALPHA);

264

c10 = c10 * vec4(ALPHA);

265

c20 = c20 * vec4(ALPHA);

266

c30 = c30 * vec4(ALPHA);

267

268

/* Store 4x4 block */

269

STORE16(dst, offset(dst, 0, 0), c00);

270

STORE16(dst, offset(dst, 0, 1), c10);

271

STORE16(dst, offset(dst, 0, 2), c20);

272

STORE16(dst, offset(dst, 0, 3), c30);

273

}

274

#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */

275

276

#ifdef GEMM_MM_FLOATING_POINT

277

BUFFER_DECLARATION(src0, 1, float, readonly);

278

BUFFER_DECLARATION(src1, 2, float, readonly);

279

BUFFER_DECLARATION(dst, 3, float, writeonly);

280

281

layout(std140) uniform shader_params

282

{

283

IMAGE_PARAM_DECLARATION(src0);

284

IMAGE_PARAM_DECLARATION(src1);

285

IMAGE_PARAM_DECLARATION(dst);

286

};

287

288

/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)

289

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication

290

*

291

* @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA

292

*

293

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

294

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

295

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

296

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

297

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

298

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

299

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

300

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

301

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

302

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

303

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

304

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

305

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

306

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

307

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

308

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

309

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

310

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

*/

void main()

{

Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);

315

Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);

316

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

317

318

int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);

319

/* Compute the address for the vector A and matrix B */

320

src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)) >> uint(2);

321

src1.current_offset = (src1_offset_first_element_in_bytes + uint(idx * 4)) >> uint(2);

322

323

/* Compute end row address for matrix A */

324

int end_row_vec_a = int(src0.current_offset) + ((COLS_A * 4) >> 2);

325

326

/* Reset accumulators */

327

vec4 acc0 = vec4(0.0f);

328

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

329

vec4 acc1 = vec4(0.0f);

330

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

331

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

332

vec4 acc2 = vec4(0.0f);

333

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

334

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

335

vec4 acc3 = vec4(0.0f);

336

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

337

338

for(; int(src0.current_offset) <= (end_row_vec_a - 2); src0.current_offset += uint(2), src1.current_offset += uint((2 * int(src1_stride_y)) >> 2))

339

{

340

vec2 a0;

341

LOAD8(a0, src0, src0.current_offset);

342

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

343

vec2 a1;

344

LOAD8(a1, src0, src0.current_offset + (src0_stride_y >> uint(2)));

345

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

346

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

347

vec2 a2;

348

LOAD8(a2, src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));

349

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

350

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

351

vec2 a3;

352

LOAD8(a3, src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));

353

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

vec4 b0;

vec4 b1;

LOAD16(b0, src1, src1.current_offset);

358

LOAD16(b1, src1, src1.current_offset + (src1_stride_y >> uint(2)));

359

360

acc0 += b0 * vec4(a0.x);

361

acc0 += b1 * vec4(a0.y);

362

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

363

acc1 += b0 * vec4(a1.x);

364

acc1 += b1 * vec4(a1.y);

365

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

366

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

367

acc2 += b0 * vec4(a2.x);

368

acc2 += b1 * vec4(a2.y);

369

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

370

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

371

acc3 += b0 * vec4(a3.x);

372

acc3 += b1 * vec4(a3.y);

373

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

374

}

375

376

for(; int(src0.current_offset) < end_row_vec_a; src0.current_offset += uint(1), src1.current_offset += uint(int(src1_stride_y) >> 2))

377

{

378

// Load values from matrix A

379

float a0;

380

a0 = LOAD4(src0, src0.current_offset);

381

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

382

float a1;

383

a1 = LOAD4(src0, src0.current_offset + ((uint(1) * src0_stride_y) >> uint(2)));

384

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

385

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

386

float a2;

387

a2 = LOAD4(src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));

388

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

389

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

390

float a3;

391

a3 = LOAD4(src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));

392

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

393

394

vec4 b0;

395

LOAD16(b0, src1, src1.current_offset);

396

397

acc0 += b0 * vec4(a0);

398

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

399

acc1 += b0 * vec4(a1);

400

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

401

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

402

acc2 += b0 * vec4(a2);

403

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

404

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

405

acc3 += b0 * vec4(a3);

406

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

407

}

408

409

/* Multiply by the weight of vector-matrix product */

410

acc0 = acc0 * vec4(ALPHA);

411

STORE16(dst, offset(dst, 0, 0), acc0);

412

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

413

acc1 = acc1 * vec4(ALPHA);

414

STORE16(dst, offset(dst, 0, 1), acc1);

415

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

416

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

417

acc2 = acc2 * vec4(ALPHA);

418

STORE16(dst, offset(dst, 0, 2), acc2);

419

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

420

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

421

acc3 = acc3 * vec4(ALPHA);

422

STORE16(dst, offset(dst, 0, 3), acc3);

423

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

424

}

425

#endif /* GEMM_MM_FLOATING_POINT */

426

427

#ifdef GEMM_MATRIXADDITION

428

BUFFER_DECLARATION(src, 1, float, readonly);

429

BUFFER_DECLARATION(dst, 2, float, restrict);

430

431

layout(std140) uniform shader_params

432

{

433

IMAGE_PARAM_DECLARATION(src);

434

IMAGE_PARAM_DECLARATION(dst);

435

};

436

437

/** This OpenGL ES kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

438

*

439

* @attention The beta's value need to be passed at compile time using BETA

440

*

441

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

442

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

443

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

444

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

445

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

446

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

447

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

448

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

449

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

450

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

451

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

452

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

*/

void main(void)

{

/* Compute source and destination addresses */

457

Image src = CONVERT_TO_IMAGE_STRUCT(src);

458

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

459

460

/* Load values from A x B */

vec4 alpha_ab;

vec4 c;

vec4 out1;

LOAD16(alpha_ab, dst, dst.current_offset);

466

LOAD16(c, src, src.current_offset);

467

468

/* Computes alpha * axb + beta * c */

469

out1 = alpha_ab + vec4(BETA * c);

470

471

/* Store final result in axb matrix */

472

STORE16(dst, dst.current_offset, out1);

473

}

474

#endif /* GEMM_MATRIXADDITION */

475

#elif defined(DATA_TYPE_FP16)

476

precision mediump float;

477

#ifdef GEMM_MM_FLOATING_POINT

478

BUFFER_DECLARATION(src0, 1, uint, readonly);

479

BUFFER_DECLARATION(src1, 2, uvec2, readonly);

480

BUFFER_DECLARATION(dst, 3, uvec2, writeonly);

481

482

layout(std140) uniform shader_params

483

{

484

IMAGE_PARAM_DECLARATION(src0);

485

IMAGE_PARAM_DECLARATION(src1);

486

IMAGE_PARAM_DECLARATION(dst);

487

};

488

489

/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)

490

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication

491

*

492

* @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA

493

*

494

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

495

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

496

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

497

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

498

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

499

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

500

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

501

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

502

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

503

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

504

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

505

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

506

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

507

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

508

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

509

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

510

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

511

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

*/

void main()

{

Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);

516

Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);

517

Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);

518

519

int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);

520

/* Compute the address for the vector A and matrix B */

521

src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));

522

src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;

523

524

/* Compute end row address for matrix A */

525

uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);

526

527

/* Reset accumulators */

528

vec4 acc0 = vec4(0.0f);

529

530

for(; src0.current_offset < (end_row_vec_a - uint(2)); src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y)

{

uint packed_a0;

vec2 a0;

GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);

536

a0 = vec2(unpackHalf2x16(packed_a0));

uvec2 packed_b0;

uvec2 packed_b1;

vec4 b0;

vec4 b1;

GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);

544

GC_LOAD1_2D_OFFSET(packed_b1, src1, 0, 1);

545

546

b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));

547

b1 = vec4(unpackHalf2x16(packed_b1.x), unpackHalf2x16(packed_b1.y));

548

549

acc0 += b0 * vec4(a0.x);

550

acc0 += b1 * vec4(a0.y);

551

}

552

553

for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += src1_stride_y)

{

uint packed_a0;

vec2 a0;

GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);

559

a0 = vec2(unpackHalf2x16(packed_a0));

uvec2 packed_b0;

vec4 b0;

GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);

565

566

b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));

acc0 += b0 * (a0.x);

}

/* Multiply by the weight of vector-matrix product */

572

acc0 = acc0 * vec4(ALPHA);

573

574

uvec2 packed_d;

575

packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));

576

GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);

577

}

578

#endif /* GEMM_MM_FLOATING_POINT */

579

580

#ifdef GEMM_ACCUMULATE_BIASES

581

BUFFER_DECLARATION(accum, 1, uvec2, restrict);

582

BUFFER_DECLARATION(biases, 2, uvec2, readonly);

583

584

layout(std140) uniform shader_params

585

{

586

IMAGE_PARAM_DECLARATION(accum);

587

VECTOR_PARAM_DECLARATION(biases);

588

};

589

590

/** This kernel accumulates each row with the biases vector

591

*

592

* @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16

593

* @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)

594

* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)

595

* @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)

596

* @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

597

* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor

598

* @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr

599

* @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)

600

* @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

601

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor

*/

void main(void)

{

Image accum = GC_CONVERT_TO_IMAGE_STRUCT(accum);

606

Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);

vec4 u[2];

uvec2 packed_s[2];

GC_LOAD1_2D_OFFSET(packed_s[0], accum, 0, 0);

611

GC_LOAD1_1D_OFFSET(packed_s[1], biases, 0);

612

u[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));

613

u[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));

vec4 tmp;

tmp = u[0] + u[1];

packed_s[0] = uvec2(packHalf2x16(tmp.xy), packHalf2x16(tmp.zw));

618

GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);

619

}

620

#endif /* GEMM_ACCUMULATE_BIASES */

621

#else /* DATA_TYPE_F32 */

622

#error Data type not supported

623

#endif /* DATA_TYPE_F32 */