/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>

namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
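// This anonymous namespace implements two NEON paths: a vector-by-matrix path
// (vector_matrix_multiply_u8/_s8) used when the output has a single row, and a
// matrix-by-matrix path (matrix_multiply_u8/_s8) that works on reshaped inputs.
// Both widen the 8-bit operands to 16 bits with vmovl and accumulate the
// 16-bit x 16-bit products into 32-bit lanes with vmlal_lane.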
void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
{
    execute_window_loop(window, [&](const Coordinates & id)
    {
        if(id.x() > width_b)
        {
            return;
        }

        // Note: Since the inputs are all positive, we can use uint32_t
        // Accumulators for the block 0
        uint32x4x4_t c0 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        auto vec_a          = reinterpret_cast<const uint8_t *>(ina.ptr());
        auto matrix_b       = reinterpret_cast<const uint8_t *>(inb.ptr());
        auto vec_a_end_addr = vec_a + width_a;

        // This for loop performs 8 accumulations
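        // Each iteration consumes 8 values of A and 8 rows of B, 16 columns at a time.
        // Scalar sketch of one iteration (illustrative only):
        //   for(int i = 0; i < 8; ++i)
        //       for(int j = 0; j < 16; ++j)
        //           acc[j] += uint32_t(vec_a[i]) * uint32_t(matrix_b[i * stride_b + j]);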
        for(; vec_a <= (vec_a_end_addr - 8);)
        {
            const uint8x8_t  a00_u8 = vld1_u8(vec_a);
            const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
            const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
            const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
            const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
            const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
            const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
            const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
            const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);

            // Widen a00_u8 to uint16_t and split it into its lower and upper halves
            const uint16x4x2_t a00_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(a00_u8)),
                    vget_high_u16(vmovl_u8(a00_u8))
                }
            };

            const uint16x4x4_t b00_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
                }
            };

            const uint16x4x4_t b10_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
                }
            };

            const uint16x4x4_t b20_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
                }
            };

            const uint16x4x4_t b30_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
                }
            };

            const uint16x4x4_t b40_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
                }
            };

            const uint16x4x4_t b50_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
                }
            };

            const uint16x4x4_t b60_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
                }
            };

            const uint16x4x4_t b70_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
                }
            };

            // Accumulate 0:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);

            // Accumulate 1:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);

            // Accumulate 2:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);

            // Accumulate 3:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);

            // Accumulate 4:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);

            // Accumulate 5:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);

            // Accumulate 6:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);

            // Accumulate 7:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);

            vec_a += 8;
            matrix_b += 8 * stride_b;
        }

        // This for loop performs the left-over accumulations
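        // One value of A at a time: vld1_dup_u8 broadcasts the single A value to all
        // lanes, so the same vmlal_lane pattern as above can be reused for one row of B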
        for(; vec_a < vec_a_end_addr;)
        {
            const uint8x8_t  a00_u8 = vld1_dup_u8(vec_a);
            const uint8x16_t b00_u8 = vld1q_u8(matrix_b);

            const uint16x4x4_t b00_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
                }
            };

            // Convert a00_u8 to uint16_t and get the lower part
            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));

            // Accumulate 0:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);

            vec_a += 1;
            matrix_b += stride_b;
        }

        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
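        // The vector path stores 16 values at once, so it is taken only while more than
        // 16 output elements remain; the tail is written with scalar stores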
        if(id.x() < (width_out - 16))
        {
            vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
            vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
            vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
            vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
        }
        else
        {
            auto left_over = width_out - id.x();
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(vec_out + k * 4 + j) = c0.val[k][j];
                }
            }
        }
    },
    ina, inb, out);
}

void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
{
    execute_window_loop(window, [&](const Coordinates & id)
    {
        if(id.x() > width_b)
        {
            return;
        }

        // Accumulators for the block 0
        int32x4x4_t c0 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        auto vec_a          = reinterpret_cast<const int8_t *>(ina.ptr());
        auto matrix_b       = reinterpret_cast<const int8_t *>(inb.ptr());
        auto vec_a_end_addr = vec_a + width_a;

        // This for loop performs 8 accumulations
        for(; vec_a <= (vec_a_end_addr - 8);)
        {
            const int8x8_t  a00_s8 = vld1_s8(vec_a);
            const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
            const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
            const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
            const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
            const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
            const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
            const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
            const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);

            // Widen a00_s8 to int16_t and split it into its lower and upper halves
            const int16x4x2_t a00_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(a00_s8)),
                    vget_high_s16(vmovl_s8(a00_s8))
                }
            };

            const int16x4x4_t b00_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
                }
            };

            const int16x4x4_t b10_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
                }
            };

            const int16x4x4_t b20_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
                }
            };

            const int16x4x4_t b30_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
                }
            };

            const int16x4x4_t b40_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
                }
            };

            const int16x4x4_t b50_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
                }
            };

            const int16x4x4_t b60_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
                }
            };

            const int16x4x4_t b70_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
                }
            };

            // Accumulate 0:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);

            // Accumulate 1:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);

            // Accumulate 2:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);

            // Accumulate 3:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);

            // Accumulate 4:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);

            // Accumulate 5:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);

            // Accumulate 6:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);

            // Accumulate 7:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);

            vec_a += 8;
            matrix_b += 8 * stride_b;
        }

        // This for loop performs the left-over accumulations
        for(; vec_a < vec_a_end_addr;)
        {
            const int8x8_t  a00_s8 = vld1_dup_s8(vec_a);
            const int8x16_t b00_s8 = vld1q_s8(matrix_b);

            const int16x4x4_t b00_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
                }
            };

            // Convert a00_s8 to int16_t and get the lower part
            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));

            // Accumulate 0:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);

            vec_a += 1;
            matrix_b += stride_b;
        }

        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
        if(id.x() < (width_out - 16))
        {
            vst1q_s32(vec_out + 0, c0.val[0]);
            vst1q_s32(vec_out + 4, c0.val[1]);
            vst1q_s32(vec_out + 8, c0.val[2]);
            vst1q_s32(vec_out + 12, c0.val[3]);
        }
        else
        {
            auto left_over = width_out - id.x();
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(vec_out + k * 4 + j) = c0.val[k][j];
                }
            }
        }
    },
    ina, inb, out);
}

void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
{
    const auto   width_out  = static_cast<int>(out_info.dimension(0));
    const auto   height_out = static_cast<int>(out_info.dimension(1));
    const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
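    // As in matrix_multiply_s8 below, matrix A is expected to have been reshaped with
    // CpuGemmInterleave4x4 and matrix B with CpuGemmTranspose1xW, so all the values
    // needed for a 4x16 output block are read from consecutive memory positions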
    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8_t *mtx_a0 = ina.ptr();
        const uint8_t *mtx_b0 = inb.ptr();

        // Note: Since the inputs are all positive, we can use uint32_t
        // Accumulators for the block 0
        uint32x4x4_t c0 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        // Accumulators for the block 1
        uint32x4x4_t c1 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        // Accumulators for the block 2
        uint32x4x4_t c2 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        // Accumulators for the block 3
        uint32x4x4_t c3 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
        {
            const uint8x8_t  a00_u8 = vld1_u8(mtx_a0);
            const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);

            // Convert a00_u8 to uint16_t and get the lower part
            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));

            // Convert b00_u8 to uint16_t
            const uint16x4x4_t b00_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
                }
            };

            // 4x4 block 0
            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);

            // 4x4 block 1
            c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
            c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
            c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
            c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);

            // 4x4 block 2
            c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
            c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
            c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
            c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);

            // 4x4 block 3
            c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
            c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
            c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
            c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
        }

        auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());

        if(id.y() < height_out && id.x() < (width_out - 16))
        {
            vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
            vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
            vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
            vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
            if(id.y() + 1 < height_out)
            {
                vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
                vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
                vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
                vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
                if(id.y() + 2 < height_out)
                {
                    vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
                    vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
                    vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
                    vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
                    if(id.y() + 3 < height_out)
                    {
                        vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
                        vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
                        vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
                        vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
                    }
                }
            }
        }
        else
        {
            const auto left_over_value = width_out - id.x();
            auto       left_over       = left_over_value;
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(mtx_out + k * 4 + j) = c0.val[k][j];
                }
            }
            if(id.y() + 1 < height_out)
            {
                left_over = left_over_value;
                for(auto k = 0; k < 4 && left_over; ++k)
                {
                    for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                    {
                        *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
                    }
                }
                if(id.y() + 2 < height_out)
                {
                    left_over = left_over_value;
                    for(auto k = 0; k < 4 && left_over; ++k)
                    {
                        for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                        {
                            *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
                        }
                    }
                    if(id.y() + 3 < height_out)
                    {
                        left_over = left_over_value;
                        for(auto k = 0; k < 4 && left_over; ++k)
                        {
                            for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                            {
                                *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
                            }
                        }
                    }
                }
            }
        }
    },
    ina, inb, out);
}

void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
{
    const auto   width_out  = static_cast<int>(out_info.dimension(0));
    const auto   height_out = static_cast<int>(out_info.dimension(1));
    const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
    // The implementation assumes that matrix A and matrix B have been reshaped with CpuGemmInterleave4x4 and CpuGemmTranspose1xW respectively
    // The reshaping makes the implementation cache friendly and avoids the data re-arrangements needed for computing 16x4 elements per iteration
    // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
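    // Scalar sketch of the computation (illustrative only): each step of the k loop
    // reads 4 interleaved values of A and 16 transposed values of B and updates a
    // 4x16 block of S32 accumulators:
    //   for(int k = 0; k < width_b; k += 16, a += 4, b += 16)
    //       for(int r = 0; r < 4; ++r)
    //           for(int c = 0; c < 16; ++c)
    //               acc[r][c] += int32_t(a[r]) * int32_t(b[c]);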
    execute_window_loop(window, [&](const Coordinates & id)
    {
        auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
        auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());

        // Accumulators for the block 0
        int32x4x4_t c0 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        // Accumulators for the block 1
        int32x4x4_t c1 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        // Accumulators for the block 2
        int32x4x4_t c2 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        // Accumulators for the block 3
        int32x4x4_t c3 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
        {
            const int8x8_t  a00_s8 = vld1_s8(mtx_a0);
            const int8x16_t b00_s8 = vld1q_s8(mtx_b0);

            // Convert a00_s8 to int16_t and get the lower part
            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));

            // Convert b00_s8 to int16_t
            const int16x4x4_t b00_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
                }
            };

            // 4x4 block 0
            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);

            // 4x4 block 1
            c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
            c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
            c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
            c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);

            // 4x4 block 2
            c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
            c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
            c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
            c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);

            // 4x4 block 3
            c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
            c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
            c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
            c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
        }
        auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
        if(id.y() < height_out && id.x() < (width_out - 16))
        {
            vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
            vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
            vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
            vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
            if(id.y() + 1 < height_out)
            {
                vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
                vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
                vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
                vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
                if(id.y() + 2 < height_out)
                {
                    vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
                    vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
                    vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
                    vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
                    if(id.y() + 3 < height_out)
                    {
                        vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
                        vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
                        vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
                        vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
                    }
                }
            }
        }
        else if(id.y() < height_out)
        {
            const auto left_over_value = width_out - id.x();
            auto       left_over       = left_over_value;
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(mtx_out + k * 4 + j) = c0.val[k][j];
                }
            }
            if(id.y() + 1 < height_out)
            {
                left_over = left_over_value;
                for(auto k = 0; k < 4 && left_over; ++k)
                {
                    for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                    {
                        *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
                    }
                }
                if(id.y() + 2 < height_out)
                {
                    left_over = left_over_value;
                    for(auto k = 0; k < 4 && left_over; ++k)
                    {
                        for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                        {
                            *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
                        }
                    }
                    if(id.y() + 3 < height_out)
                    {
                        left_over = left_over_value;
                        for(auto k = 0; k < 4 && left_over; ++k)
                        {
                            for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                            {
                                *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
                            }
                        }
                    }
                }
            }
        }
    },
    ina, inb, out);
}

Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);

    TensorShape in0_shape = src0->tensor_shape();
    TensorShape in1_shape = src1->tensor_shape();
    TensorShape out_shape = dst->tensor_shape();

    // Check vector-by-matrix case
    if(out_shape[1] == 1)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
    }
    else
    {
        in0_shape.collapse(2);
        in1_shape.collapse(2);
        out_shape.collapse(2);

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches as input0 tensor");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches as input0 or the number of batches must be set to 1");
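        // For 8-bit data CpuGemmTranspose1xW packs blocks of 16 elements per row,
        // hence the width requirement below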
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
    }

    return Status{};
}
} // namespace

void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
    ARM_COMPUTE_UNUSED(src0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst));

    TensorShape in1_shape = src1->tensor_shape();
    in1_shape.collapse(2);

    _slide_matrix_b = in1_shape[2] != 1;

    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
    constexpr unsigned int num_elems_processed_per_iteration_y = 4;

    Window win;
    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
    if((dst->dimension(1) == 1))
    {
        // Configure kernel window
        win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
    }
    else
    {
        win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
    }
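    // With these steps each window iteration covers a 16x4 block of dst in the
    // matrix case and a 16x1 block in the vector case, matching the kernels above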

    ICpuKernel::configure(win);
}

Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst));
    return Status{};
}
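
// Hypothetical usage sketch (not part of this file): a caller is expected to check
// validate() on the tensor infos before configuring the kernel, e.g.:
//   ARM_COMPUTE_ERROR_THROW_ON(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(a, b, dst));
//   kernel.configure(a, b, dst);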

void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto dst  = tensors.get_tensor(TensorType::ACL_DST);

    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
    if((dst->info()->dimension(1) == 1))
    {
        const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0));
        const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0));
        const auto width_out      = static_cast<int>(dst->info()->dimension(0));
        const auto in_b_stride    = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));

        // The implementation computes 16 elements per iteration
        const int window_start_x = 16 * info.thread_id;
        const int window_step_x  = 16 * info.num_threads;
        // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
        const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
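        // Threads interleave along X: thread t starts at column 16 * t and advances by
        // 16 * num_threads, distributing the 16-wide output chunks round-robin across threads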

        Window win_out(window);
        win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
        win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

        Window win_a(window);
        win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
        win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

        Window win_b;
        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
        if(src1->info()->num_dimensions() >= 3)
        {
            win_b = window;
        }
        win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
        win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

        Iterator ina(src0, win_a);
        Iterator inb(src1, win_b);
        Iterator out(dst, win_out);

        switch(src0->info()->data_type())
        {
            case DataType::S8:
            case DataType::QASYMM8_SIGNED:
            {
                vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
                break;
            }
            case DataType::U8:
            case DataType::QASYMM8:
            {
                vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Not supported");
                break;
            }
        }
    }
    else
    {
        const size_t in_b_stride = src1->info()->strides_in_bytes()[1];
        const int    width_b     = src1->info()->dimension(0);

        // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
        Window win_a(window);
        win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
        win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));

        // Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the transposed input matrix B has 16 times fewer columns than the output matrix
        Window win_b;
        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
        if(_slide_matrix_b)
        {
            win_b = window;
        }
        win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
        win_b.set(Window::DimY, Window::Dimension(0, 0, 0));

        // The step x and step y for the output matrix have already been set in configure()
        Iterator ina(src0, win_a);
        Iterator inb(src1, win_b);
        Iterator out(dst, window);

        switch(src0->info()->data_type())
        {
            case DataType::S8:
            case DataType::QASYMM8_SIGNED:
            {
                matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window);
                break;
            }
            case DataType::U8:
            case DataType::QASYMM8:
            {
                matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window);
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Not supported");
                break;
            }
        }
    }
}

const char *CpuGemmLowpMatrixMultiplyKernel::name() const
{
    return "CpuGemmLowpMatrixMultiplyKernel";
}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute