Blame - src/core/CL/cl_kernels/tile_helpers.h - ml/ComputeLibrary

/* Byte offset of element (x, y, z, w) of a 4D tensor.
 *
 * Expects `base##_offset_first_element_in_bytes`, `base##_stride_y`, `base##_stride_z` and
 * `base##_stride_w` to be visible where the macro is expanded.
 * Each coordinate is parenthesised so that a compound argument (e.g. `x + 1`) is scaled as a whole.
 */
#define OFFSET4D(base, data_type, x, y, z, w) (base##_offset_first_element_in_bytes + (x) * sizeof(data_type) + (y) * base##_stride_y + (z) * base##_stride_z + (w) * base##_stride_w)

97

98

/** Dot product integer 8bit function

99

*

100

* @note Performs: c += dot(a, b)

101

*

102

* @param[in] DST_DATA_TYPE Accumulator data type

103

* @param[in] K0 Number of accumulations

104

* @param[in] a OpenCL vector a

105

* @param[in] b OpenCL vector b

106

* @param[in] c Scalar variable c

107

*/

108

/* Entry point: forwards to the _STR helper so that K0 is macro-expanded before it is token-pasted. */
#define DOT_PRODUCT_INTEGER8(DST_DATA_TYPE, K0, a, b, c) \
    DOT_PRODUCT_INTEGER8_STR(DST_DATA_TYPE, K0, a, b, c)
/* Pastes the accumulation count into the name of the fixed-size implementation (DOT_PRODUCT<K>_INTEGER8). */
#define DOT_PRODUCT_INTEGER8_STR(ACC_TYPE, K, lhs, rhs, acc) \
    DOT_PRODUCT##K##_INTEGER8(ACC_TYPE, lhs, rhs, acc)

110

/* K0 = 1: scalar multiply-accumulate, c += a * b computed in DST_DATA_TYPE.
 * The operands are parenthesised so that each cast applies to the whole argument expression. */
#define DOT_PRODUCT1_INTEGER8(DST_DATA_TYPE, a, b, c) \
    ({ \
        (c) += (DST_DATA_TYPE)(a) * (DST_DATA_TYPE)(b); \
    })

114

/* K0 = 2: accumulates the products of components s0 and s1 into c, each widened to DST_DATA_TYPE.
 * The vector arguments are parenthesised before component access so any expression can be passed. */
#define DOT_PRODUCT2_INTEGER8(DST_DATA_TYPE, a, b, c) \
    ({ \
        (c) += (DST_DATA_TYPE)((a).s0) * (DST_DATA_TYPE)((b).s0); \
        (c) += (DST_DATA_TYPE)((a).s1) * (DST_DATA_TYPE)((b).s1); \
    })

119

/* K0 = 3: accumulates the products of components s0, s1 and s2 into c, each widened to DST_DATA_TYPE.
 * The three terms are written out (same order as the 2-element helper followed by s2) and the vector
 * arguments are parenthesised before component access so any expression can be passed. */
#define DOT_PRODUCT3_INTEGER8(DST_DATA_TYPE, a, b, c) \
    ({ \
        (c) += (DST_DATA_TYPE)((a).s0) * (DST_DATA_TYPE)((b).s0); \
        (c) += (DST_DATA_TYPE)((a).s1) * (DST_DATA_TYPE)((b).s1); \
        (c) += (DST_DATA_TYPE)((a).s2) * (DST_DATA_TYPE)((b).s2); \
    })

124

/* K0 = 4: val += dot(x, y).
 * Uses the Arm integer dot-product built-ins when the corresponding extension is enabled, otherwise a
 * component-wise fallback. Every variant is a statement expression without a trailing semicolon, so
 * all three behave identically at the call site. */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) \
    ({ \
        (val) = arm_dot_acc((x), (y), (val)); \
    })
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) \
    ({ \
        (val) += arm_dot((x), (y)); \
    })
#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) \
    ({ \
        (val) += (DST_DATA_TYPE)((x).s0) * (DST_DATA_TYPE)((y).s0); \
        (val) += (DST_DATA_TYPE)((x).s1) * (DST_DATA_TYPE)((y).s1); \
        (val) += (DST_DATA_TYPE)((x).s2) * (DST_DATA_TYPE)((y).s2); \
        (val) += (DST_DATA_TYPE)((x).s3) * (DST_DATA_TYPE)((y).s3); \
    })
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

/* K0 = 8: two 4-element dot products over the .lo and .hi halves.
 * DST_DATA_TYPE must be forwarded: DOT_PRODUCT4_INTEGER8 takes four arguments. */
#define DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })

/* K0 = 16: two 8-element dot products over the .lo and .hi halves.
 * DST_DATA_TYPE must be forwarded: DOT_PRODUCT8_INTEGER8 takes four arguments. */
#define DOT_PRODUCT16_INTEGER8(DST_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })

147

148

/** Load a vector from global memory (tensor)

149

*

150

* @param[in] DATA_TYPE Data type

151

* @param[in] WIDTH Number of dst columns

152

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

153

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

154

* @param[in] TENSOR Tensor basename

155

* @param[in] X Starting X position

156

* @param[in] Y Starting Y position

157

* @param[in] STRIDE_Y Stride Y (in bytes)

158

*/

159

/* Two-step expansion: V_LOAD lets its arguments expand, then V_LOAD_STR pastes TENSOR_TYPE into the
 * name of the storage-specific loader (V_LOAD_BUFFER or V_LOAD_IMAGE). */
#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) \
    V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) \
    V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)

161

/* cl_buffer loader: VLOAD of WIDTH elements starting at the byte address
 * TENSOR_ptr + first-element offset + X * sizeof(DATA_TYPE) + Y * STRIDE_Y. */
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH) \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y)*STRIDE_Y))

Gian Marco Iodice

5c9eed8

2021-03-19 11:26:20 +0000

[diff] [blame]

164

/* cl_image loader: reads from TENSOR_img at pixel column X / 4 and row Y; STRIDE_Y is unused here. */
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))

165

166

/** Load a tile from global memory (tensor)

167

*

Gian Marco Iodice

0b76f7d

2021-04-08 17:20:00 +0100

[diff] [blame]

168

* @param[in] DATA_TYPE Data type

169

* @param[in] HEIGHT Number of dst rows

170

* @param[in] WIDTH Number of dst columns

171

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).

172

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

173

* @param[in] TENSOR Tensor basename

174

* @param[in] X Starting X position

175

* @param[in] Y Starting Y position

176

* @param[in] YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).

177

* In common cases should be 1 but it becomes useful when we want to load rows which are multiple of STRIDE_Y. (e.g. loading the weights of convolution layer).

178

* In this case the address calculation is performed as: (Y + _i * YI_MULTIPLIER) * STRIDE_Y

179

* @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.

180

* @param[out] dst Output tile

Gian Marco Iodice

5c9eed8

2021-03-19 11:26:20 +0000

[diff] [blame]

181

*/

Gian Marco Iodice

0b76f7d

2021-04-08 17:20:00 +0100

[diff] [blame]

182

#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
    ({ \
        /* One vector load per tile row; row _i comes from tensor row (Y) + _i * YI_MULTIPLIER */ \
        LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        } \
    })

189

190

/** Load a tile from global memory (tensor) using an indirect Y index tile

191

*

192

* @param[in] DATA_TYPE Data type

193

* @param[in] HEIGHT Number of dst rows

194

* @param[in] WIDTH Number of dst columns

195

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

196

* In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)

197

* @param[in] TENSOR Tensor basename

198

* @param[in] X Starting X position

199

* @param[in] STRIDE_Y Stride Y (in bytes)

200

* @param[in] indirect_y Indirect Y index tile

201

* @param[out] dst Output tile

202

*/

203

#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
    ({ \
        /* Row _i of the tile is fetched from the tensor row stored in indirect_y[_i] */ \
        LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        } \
    })

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

211

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout

212

*

213

* @param[in] DATA_TYPE Data type

214

* @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension

215

* @param[in] TILE_WIDTH Number of elements to load from X (width) dimension

216

* @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension

217

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

218

* In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)

219

* @param[in] TENSOR Tensor basename

220

* @param[in] B Starting batch index

221

* @param[in] Y Starting Y index

222

* @param[in] X Starting X index

223

* @param[in] C Starting C index

224

* @param[in] TENSOR_HEIGHT Height of the tensor (upper bound used to validate the Y coordinate)

225

* @param[in] TENSOR_WIDTH Width of the tensor (upper bound used to validate the X coordinate)

226

* @param[in] STRIDE_Y Stride Y (in bytes)

227

* @param[out] dst Output tile

228

*/

Gian Marco Iodice

0b76f7d

2021-04-08 17:20:00 +0100

[diff] [blame]

229

#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, (TILE_HEIGHT), 1) \
        { \
            LOOP_UNROLLING(int, _xk, 0, (TILE_WIDTH), 1) \
            { \
                /* Flattened row index: x + y * width, then advanced by whole batches */ \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                /* Non-zero only when both coordinates fall inside the tensor */ \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                /* Out-of-bounds positions are skipped: the matching dst row is left as it was */ \
                if(_src_valid_y != 0) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            } \
        } \
    })

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout using indirect X and Y coordinates

247

*

248

* @param[in] DATA_TYPE Data type

249

* @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension

250

* @param[in] TILE_WIDTH Number of elements to load from X (width) dimension

251

* @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension

252

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

253

* In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)

254

* @param[in] TENSOR Tensor basename

255

* @param[in] B Starting batch index

256

* @param[in] Y Starting Y index

257

* @param[in] X Starting X index

258

* @param[in] C Starting C index

259

* @param[in] TENSOR_HEIGHT Height of the tensor (upper bound used to validate the Y coordinate)

260

* @param[in] TENSOR_WIDTH Width of the tensor (upper bound used to validate the X coordinate)

261

* @param[in] STRIDE_Y Stride Y (in bytes)

262

* @param[in] xi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate

263

* @param[in] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate

264

* @param[out] dst Output tile

265

*/

266

#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, (TILE_WIDTH * TILE_HEIGHT), 1) \
        { \
            /* Flattened row index built from the per-element offsets xi[_i] / yi[_i], plus the batch offset */ \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            /* Non-zero only when both coordinates fall inside the tensor */ \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            /* Out-of-bounds positions are skipped: dst[_i] is left as it was */ \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        } \
    })

279

Gian Marco Iodice

5c9eed8

2021-03-19 11:26:20 +0000

[diff] [blame]

280

/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store

281

*

282

* @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length for the store

283

* @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones

284

*

285

* @param[in] DATA_TYPE Data type

286

* @param[in] HEIGHT Number of src rows

287

* @param[in] WIDTH0 Store width to use if WIDTH1_CONDITION = false

288

* @param[in] WIDTH1 Store width to use if WIDTH1_CONDITION = true

289

* @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported

290

* cl_image is not supported.

291

* @param[in] TENSOR Tensor basename

292

* @param[in] X Starting X position

293

* @param[in] STRIDE_Y Stride Y (in bytes)

294

* @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store

295

* @param[in] src Input tile

296

* @param[in] indirect_y Indirect Y index tile

297

*/

Gian Marco Iodice

a8903c8

2021-03-24 14:48:22 +0000

[diff] [blame]

298

#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            /* Partial store: rows are visited from the last to the first */ \
            LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (src[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            } \
        } \
        else \
        { \
            /* Full-width store: same reverse row order, same destination address */ \
            LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
            { \
                VSTORE(WIDTH0) \
                (src[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            } \
        } \
    })

317

318

/** Offset correction for the QASYMM8 computation

319

*

320

* @param[in] ACC_DATA_TYPE Accumulator data type

321

* @param[in] M0 Number of src/dst rows

322

* @param[in] N0 Number of src/dst columns

323

* @param[in] K0 Number of src columns

324

* @param[in] SRC_OFFSET Source quantization offset

325

* @param[in] WEI_OFFSET Weights quantization offset

326

* @param[in] lhs LHS tile

327

* @param[in] rhs RHS tile

328

* @param[out] dst DST tile

329

*/

330

#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, M0, 1) \
        { \
            /* Row term: sum over k of lhs[m][k] * WEI_OFFSET, shared by every column of the row */ \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, K0, 1) \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET); \
            } \
            LOOP_UNROLLING(int, _n0, 0, N0, 1) \
            { \
                dst[_m0].s[_n0] += _tm; \
                /* Column term: sum over k of rhs[n][k] * SRC_OFFSET */ \
                LOOP_UNROLLING(int, _k0, 0, K0, 1) \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
                } \
            } \
        } \
    })

/** Quantize the tile (ASYMMETRIC) with fixed-point scale

351

*

352

* @param[in] SRC_DATA_TYPE SRC data type

353

* @param[in] DST_DATA_TYPE DST data type

354

* @param[in] M0 Number of src/dst rows

355

* @param[in] N0 Number of src/dst columns

356

* @param[in] DST_OFFSET Quantization offset

357

* @param[in] DST_SHIFT Quantization shift

358

* @param[in] DST_MULTIPLIER Quantization multiplier

359

* @param[in] src Input tile

360

* @param[out] dst Output tile

361

*/

362

#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, M0, 1) \
        { \
            LOOP_UNROLLING(int, _n0, 0, N0, 1) \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                /* The sign of the shift selects which fixed-point multiply helper is used */ \
                if(DST_SHIFT < 0) \
                { \
                    _tmp = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(src[_m0].s[_n0], DST_MULTIPLIER, DST_SHIFT, 1); \
                } \
                else \
                { \
                    _tmp = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(src[_m0].s[_n0], DST_MULTIPLIER, DST_SHIFT, 1); \
                } \
                /* Add the destination offset, then saturate into the destination type */ \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            } \
        } \
    })

/** Conditional rowset (memset by row)

384

*

385

* @note Set the row to VALUE_TO_SET if the corresponding mask == 0

386

*

387

* @param[in] DATA_TYPE Data type

388

* @param[in] M0 Number of LHS rows

389

* @param[in] N0 Number of LHS columns

390

* @param[in] VALUE_TO_SET Value to set the row

391

* @param[in, out] a Input/output tile

392

* @param[in] mask Mask to check for setting the row to VALUE_TO_SET

393

*/

394

#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, M0, 1) \
        { \
            LOOP_UNROLLING(int, _n0, 0, N0, 1) \
            { \
                /* Keep the element unless the row mask equals 0, in which case VALUE_TO_SET is written */ \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            } \
        } \
    })

Gian Marco Iodice

2021-03-24 14:48:22 +0000

[diff] [blame]

405

/** Element-wise activation

406

*

407

* @note Performs: activation(LHS) = DST

408

*

409

* @param[in] DATA_TYPE SRC/DST data type

410

* @param[in] M0 Number of SRC/DST rows

411

* @param[in] N0 Number of SRC/DST columns

412

* @param[in] ACTIVATION_TYPE Activation type

413

* @param[in] A_VAL A value used for the activation (e.g. tanh_op, brelu,..)

414

* @param[in] B_VAL B value used for the activation (e.g. tanh_op, brelu,..)

415

* @param[in] src SRC tile

416

* @param[out] dst DST tile

417

*/

418

#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        /* The activation is applied one whole row vector (N0 elements) at a time */ \
        LOOP_UNROLLING(int, _m0, 0, M0, 1) \
        { \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        } \
    })

Gian Marco Iodice

2021-03-19 11:26:20 +0000

[diff] [blame]

426

/** Element-wise addition with a constant value

427

*

428

* @note Performs: LHS + constant = DST

429

*

430

* @param[in] DATA_TYPE LHS/RHS/DST data type

431

* @param[in] M0 Number of LHS rows

432

* @param[in] N0 Number of LHS columns

433

* @param[in] lhs LHS tile

434

* @param[in] rhs_constant Constant value

435

* @param[out] dst DST tile

436

*/

437

#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, M0, 1) \
        { \
            LOOP_UNROLLING(int, _n0, 0, N0, 1) \
            { \
                /* Element-wise: the same constant is added to every element of the tile */ \
                dst[_m0].s[_n0] = lhs[_m0].s[_n0] + rhs_constant; \
            } \
        } \
    })

/** Element-wise addition with RHS broadcasted (RHS has the X dimension only)

449

*

450

* @note Performs: LHS + RHS[broadcasted] = DST

451

* @note Both tiles must have same data type

452

*

453

* @param[in] DATA_TYPE LHS/RHS/DST data type

454

* @param[in] M0 Number of LHS rows

455

* @param[in] N0 Number of LHS columns

456

* @param[in] lhs LHS tile

457

* @param[in] rhs RHS tile

458

* @param[out] dst DST tile

459

*/

460

#define T_ADD_BROADCAST_X(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        /* Row 0 of rhs is added, as a whole vector, to every row of lhs */ \
        LOOP_UNROLLING(int, _m0, 0, M0, 1) \
        { \
            dst[_m0].v = lhs[_m0].v + rhs[0].v; \
        } \
    })

/** Matrix multiplication

469

*

470

* @note Performs: LHS X RHS + DST = DST

471

*

472

* @param[in] LHS_DATA_TYPE LHS tile data type

473

* @param[in] RHS_DATA_TYPE RHS tile data type

474

* @param[in] DST_DATA_TYPE DST tile data type

475

* @param[in] M0 Number of LHS rows

476

* @param[in] N0 Number of RHS columns

477

* @param[in] K0 Number of LHS columns

478

* @param[in] LHS_LAYOUT LHS layout (T= transposed, NT= not transposed)

479

* @param[in] RHS_LAYOUT RHS layout (T= transposed, NT= not transposed)

480

* @param[in] lhs LHS tile

481

* @param[in] rhs RHS tile

482

* @param[in, out] dst DST tile

483

*/

484

/* Dispatch on the layouts: pastes LHS_LAYOUT and RHS_LAYOUT into the name of the layout-specific macro. */
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) \
    T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
/* NT x T variant: dispatches again on the three data types by pasting them into the macro name. */
#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

486