/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include <arm_neon.h>

namespace arm_compute
{
namespace
{
inline void vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
{
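    // Note: illustrative sketch (editorial, not part of the computation) of what each window
    // iteration below does, in scalar terms, assuming matrix B rows are stride_b bytes apart:
    //
    //     for(int x = 0; x < 16; ++x)
    //     {
    //         uint32_t acc = 0;
    //         for(int i = 0; i < width_a; ++i)
    //         {
    //             acc += uint32_t(a[i]) * uint32_t(b[i][id.x() + x]);
    //         }
    //         out[id.x() + x] = acc;
    //     }
    //
    // The four uint32x4_t lanes of the accumulator c0 hold these 16 partial sums.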
    execute_window_loop(window, [&](const Coordinates & id)
    {
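        // Note: run() rounds the window end up to a multiple of the 16-element step, so a
        // thread's last iteration can start past the end of matrix B; bail out in that case.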
        if(id.x() > width_b)
        {
            return;
        }

        // Note: Since the inputs are all non-negative, we can accumulate in uint32_t
        // Accumulators for the block 0
        uint32x4x4_t c0 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        auto vec_a          = reinterpret_cast<const uint8_t *>(ina.ptr());
        auto matrix_b       = reinterpret_cast<const uint8_t *>(inb.ptr());
        auto vec_a_end_addr = vec_a + width_a;

        // This loop performs 8 accumulations per iteration
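        // Note: each unrolled step k below computes, for output lane x in [0, 16):
        //     c0[x] += uint32_t(vec_a[k]) * uint32_t(matrix_b[k * stride_b + x])
        // The u8 operands are widened to u16 with vmovl_u8, then vmlal_lane_u16 does the
        // widening multiply-accumulate into the u32 accumulators, one lane of vec_a at a time.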
        for(; vec_a <= (vec_a_end_addr - 8);)
        {
            const uint8x8_t  a00_u8 = vld1_u8(vec_a);
            const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
            const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
            const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
            const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
            const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
            const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
            const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
            const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);

            // Convert a00_u8 to uint16_t and split it into its lower and upper halves
            const uint16x4x2_t a00_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(a00_u8)),
                    vget_high_u16(vmovl_u8(a00_u8))
                }
            };

            const uint16x4x4_t b00_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
                }
            };

            const uint16x4x4_t b10_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
                }
            };

            const uint16x4x4_t b20_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
                }
            };

            const uint16x4x4_t b30_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
                }
            };

            const uint16x4x4_t b40_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
                }
            };

            const uint16x4x4_t b50_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
                }
            };

            const uint16x4x4_t b60_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
                }
            };

            const uint16x4x4_t b70_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
                }
            };

            // Accumulate 0:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);

            // Accumulate 1:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);

            // Accumulate 2:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);

            // Accumulate 3:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);

            // Accumulate 4:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);

            // Accumulate 5:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);

            // Accumulate 6:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);

            // Accumulate 7:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);

            vec_a += 8;
            matrix_b += 8 * stride_b;
        }

        // This loop performs the left-over accumulations
        for(; vec_a < vec_a_end_addr;)
        {
            const uint8x8_t  a00_u8 = vld1_dup_u8(vec_a);
            const uint8x16_t b00_u8 = vld1q_u8(matrix_b);

            const uint16x4x4_t b00_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
                }
            };

            // Convert a00_u8 to uint16_t and get the lower part
            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));

            // Accumulate 0:
            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);

            vec_a += 1;
            matrix_b += stride_b;
        }

        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
        if(id.x() < (width_out - 16))
        {
            vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
            vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
            vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
            vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
        }
        else
        {
            auto left_over = width_out - id.x();
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(vec_out + k * 4 + j) = c0.val[k][j];
                }
            }
        }
    },
    ina, inb, out);
}

inline void vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
{
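    // Note: this mirrors vector_matrix_multiply_u8 above, with the signed s8/s16 intrinsics
    // and int32_t accumulators in place of the unsigned ones.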
    execute_window_loop(window, [&](const Coordinates & id)
    {
        if(id.x() > width_b)
        {
            return;
        }

        // Accumulators for the block 0
        int32x4x4_t c0 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        auto vec_a          = reinterpret_cast<const int8_t *>(ina.ptr());
        auto matrix_b       = reinterpret_cast<const int8_t *>(inb.ptr());
        auto vec_a_end_addr = vec_a + width_a;

        // This loop performs 8 accumulations per iteration
        for(; vec_a <= (vec_a_end_addr - 8);)
        {
            const int8x8_t  a00_s8 = vld1_s8(vec_a);
            const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
            const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
            const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
            const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
            const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
            const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
            const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
            const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);

            // Convert a00_s8 to int16_t and split it into its lower and upper halves
            const int16x4x2_t a00_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(a00_s8)),
                    vget_high_s16(vmovl_s8(a00_s8))
                }
            };

            const int16x4x4_t b00_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
                }
            };

            const int16x4x4_t b10_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
                }
            };

            const int16x4x4_t b20_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
                }
            };

            const int16x4x4_t b30_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
                }
            };

            const int16x4x4_t b40_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
                }
            };

            const int16x4x4_t b50_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
                }
            };

            const int16x4x4_t b60_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
                }
            };

            const int16x4x4_t b70_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
                }
            };

            // Accumulate 0:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);

            // Accumulate 1:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);

            // Accumulate 2:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);

            // Accumulate 3:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);

            // Accumulate 4:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);

            // Accumulate 5:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);

            // Accumulate 6:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);

            // Accumulate 7:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);

            vec_a += 8;
            matrix_b += 8 * stride_b;
        }

        // This loop performs the left-over accumulations
        for(; vec_a < vec_a_end_addr;)
        {
            const int8x8_t  a00_s8 = vld1_dup_s8(vec_a);
            const int8x16_t b00_s8 = vld1q_s8(matrix_b);

            const int16x4x4_t b00_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
                }
            };

            // Convert a00_s8 to int16_t and get the lower part
            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));

            // Accumulate 0:
            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);

            vec_a += 1;
            matrix_b += stride_b;
        }

        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
        if(id.x() < (width_out - 16))
        {
            vst1q_s32(vec_out + 0, c0.val[0]);
            vst1q_s32(vec_out + 4, c0.val[1]);
            vst1q_s32(vec_out + 8, c0.val[2]);
            vst1q_s32(vec_out + 12, c0.val[3]);
        }
        else
        {
            auto left_over = width_out - id.x();
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(vec_out + k * 4 + j) = c0.val[k][j];
                }
            }
        }
    },
    ina, inb, out);
}

inline void matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
{
    const auto   width_out  = static_cast<int>(out_info.dimension(0));
    const auto   height_out = static_cast<int>(out_info.dimension(1));
    const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
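    // Note: illustrative sketch (assuming the reshaped operand layouts described in
    // matrix_multiply_s8 below): each window iteration produces a 4x16 output block,
    //     c<r>[x] = sum over k of A[id.y() + r][k] * B[k][id.x() + x]   for r in [0, 4), x in [0, 16)
    // which is why both operands can be read sequentially in the loop below.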
    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8_t *mtx_a0 = ina.ptr();
        const uint8_t *mtx_b0 = inb.ptr();

        // Note: Since the inputs are all non-negative, we can accumulate in uint32_t
        // Accumulators for the block 0
        uint32x4x4_t c0 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        // Accumulators for the block 1
        uint32x4x4_t c1 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        // Accumulators for the block 2
        uint32x4x4_t c2 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        // Accumulators for the block 3
        uint32x4x4_t c3 =
        {
            {
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0),
                vdupq_n_u32(0)
            }
        };

        for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
        {
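            // Note: each step consumes 4 interleaved values of A (one lane per output row) and
            // one 1x16 chunk of reshaped B, hence mtx_a0 += 4 and mtx_b0 += 16 per iteration.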
            const uint8x8_t  a00_u8 = vld1_u8(mtx_a0);
            const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);

            // Convert a00_u8 to uint16_t and get the lower part
            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));

            // Convert b00_u8 to uint16_t
            const uint16x4x4_t b00_u16 =
            {
                {
                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
                }
            };

            // 4x4 block 0
            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);

            // 4x4 block 1
            c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
            c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
            c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
            c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);

            // 4x4 block 2
            c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
            c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
            c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
            c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);

            // 4x4 block 3
            c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
            c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
            c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
            c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
        }

        auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());

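        // Store the full 4x16 block only when it fits inside the output bounds; otherwise fall
        // through to the scalar tail below, which writes the left-over columns one by one.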
        if(id.y() < height_out && id.x() < (width_out - 16))
        {
            vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
            vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
            vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
            vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
            if(id.y() + 1 < height_out)
            {
                vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
                vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
                vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
                vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
                if(id.y() + 2 < height_out)
                {
                    vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
                    vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
                    vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
                    vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
                    if(id.y() + 3 < height_out)
                    {
                        vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
                        vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
                        vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
                        vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
                    }
                }
            }
        }
        else
        {
            const auto left_over_value = width_out - id.x();
            auto       left_over       = left_over_value;
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(mtx_out + k * 4 + j) = c0.val[k][j];
                }
            }
            if(id.y() + 1 < height_out)
            {
                left_over = left_over_value;
                for(auto k = 0; k < 4 && left_over; ++k)
                {
                    for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                    {
                        *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
                    }
                }
                if(id.y() + 2 < height_out)
                {
                    left_over = left_over_value;
                    for(auto k = 0; k < 4 && left_over; ++k)
                    {
                        for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                        {
                            *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
                        }
                    }
                    if(id.y() + 3 < height_out)
                    {
                        left_over = left_over_value;
                        for(auto k = 0; k < 4 && left_over; ++k)
                        {
                            for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                            {
                                *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
                            }
                        }
                    }
                }
            }
        }
    },
    ina, inb, out);
}

inline void matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
{
    const auto   width_out  = static_cast<int>(out_info.dimension(0));
    const auto   height_out = static_cast<int>(out_info.dimension(1));
    const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW, respectively
    // The reshaping makes the implementation cache-friendly and avoids the data re-arrangements needed to compute 16x4 elements per iteration
    // All the values needed to compute a single 4x4 block are read from consecutive memory positions
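    // Note: illustrative restatement of the inner loop: the same 1x16 chunk of reshaped B is
    // scaled by each of the four interleaved A values and accumulated into one output row each,
    //     c<r>[x] += a00[r] * b00[x]   for r in [0, 4), x in [0, 16)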
    execute_window_loop(window, [&](const Coordinates & id)
    {
        auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
        auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());

        // Note: The inputs here are signed, so the accumulators are int32_t
        // Accumulators for the block 0
        int32x4x4_t c0 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        // Accumulators for the block 1
        int32x4x4_t c1 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        // Accumulators for the block 2
        int32x4x4_t c2 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        // Accumulators for the block 3
        int32x4x4_t c3 =
        {
            {
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0),
                vdupq_n_s32(0)
            }
        };

        for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
        {
            const int8x8_t  a00_s8 = vld1_s8(mtx_a0);
            const int8x16_t b00_s8 = vld1q_s8(mtx_b0);

            // Convert a00_s8 to int16_t and get the lower part
            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));

            // Convert b00_s8 to int16_t
            const int16x4x4_t b00_s16 =
            {
                {
                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
                }
            };

            // 4x4 block 0
            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);

            // 4x4 block 1
            c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
            c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
            c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
            c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);

            // 4x4 block 2
            c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
            c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
            c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
            c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);

            // 4x4 block 3
            c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
            c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
            c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
            c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
        }
        auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
        if(id.y() < height_out && id.x() < (width_out - 16))
        {
            vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
            vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
            vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
            vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
            if(id.y() + 1 < height_out)
            {
                vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
                vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
                vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
                vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
                if(id.y() + 2 < height_out)
                {
                    vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
                    vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
                    vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
                    vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
                    if(id.y() + 3 < height_out)
                    {
                        vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
                        vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
                        vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
                        vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
                    }
                }
            }
        }
        else if(id.y() < height_out)
        {
            const auto left_over_value = width_out - id.x();
            auto       left_over       = left_over_value;
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(mtx_out + k * 4 + j) = c0.val[k][j];
                }
            }
            if(id.y() + 1 < height_out)
            {
                left_over = left_over_value;
                for(auto k = 0; k < 4 && left_over; ++k)
                {
                    for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                    {
                        *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
                    }
                }
                if(id.y() + 2 < height_out)
                {
                    left_over = left_over_value;
                    for(auto k = 0; k < 4 && left_over; ++k)
                    {
                        for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                        {
                            *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
                        }
                    }
                    if(id.y() + 3 < height_out)
                    {
                        left_over = left_over_value;
                        for(auto k = 0; k < 4 && left_over; ++k)
                        {
                            for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                            {
                                *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
                            }
                        }
                    }
                }
            }
        }
    },
    ina, inb, out);
}
} // namespace

namespace
{
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);

    TensorShape in0_shape = input0->tensor_shape();
    TensorShape in1_shape = input1->tensor_shape();
    TensorShape out_shape = output->tensor_shape();

    // Check vector-by-matrix case
    if(out_shape[1] == 1)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
    }
    else
    {
        in0_shape.collapse(2);
        in1_shape.collapse(2);
        out_shape.collapse(2);

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches as the input0 tensor");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches as input0 or the number of batches must be set to 1");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
    }

    return Status{};
}
} // namespace

NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
{
}

void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));

    TensorShape in1_shape = input1->info()->tensor_shape();
    in1_shape.collapse(2);

    _input0         = input0;
    _input1         = input1;
    _output         = output;
    _slide_matrix_b = in1_shape[2] != 1;

    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
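    // Note: a kernel iteration produces a 4x16 output block (16x1 in the vector path), so the
    // window advances in steps of 16 along X and 4 along Y.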

    Window win;

    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
    if((output->info()->dimension(1) == 1))
    {
        // Configure kernel window
        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));

        Coordinates coord;
        coord.set_num_dimensions(output->info()->num_dimensions());
        output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
    }
    else
    {
        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
        output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
    }

    INEKernel::configure(win);
}

Status NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));

    return Status{};
}

void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
    if((_output->info()->dimension(1) == 1))
    {
        const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0));
        const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0));
        const auto width_out      = static_cast<int>(_output->info()->dimension(0));
        const auto in_b_stride    = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type()));

        // The implementation computes 16 elements per iteration
        const int window_start_x = 16 * info.thread_id;
        const int window_step_x  = 16 * info.num_threads;
        // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
        const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
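        // Note: the output row is partitioned among threads by striding 16-element chunks:
        // thread t handles x = 16 * t, 16 * (t + num_threads), 16 * (t + 2 * num_threads), ...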

        Window win_out(window);
        win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
        win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

        Window win_a(window);
        win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
        win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

        Window win_b;
        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
        if(_input1->info()->num_dimensions() >= 3)
        {
            win_b = window;
        }
        win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
        win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

        Iterator ina(_input0, win_a);
        Iterator inb(_input1, win_b);
        Iterator out(_output, win_out);

        switch(_input0->info()->data_type())
        {
            case DataType::S8:
            case DataType::QASYMM8_SIGNED:
            {
                vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
                break;
            }
            case DataType::U8:
            case DataType::QASYMM8:
            {
                vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Not supported");
                break;
            }
        }
    }
    else
    {
        const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
        const int    width_b     = _input1->info()->dimension(0);

        // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix
        Window win_a(window);
        win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
        win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));

        // Set step_x and step_y for matrix B. Scale the X range by a factor of 16, as the transposed input matrix B has 16 times fewer columns than the output matrix
        Window win_b;
        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
        if(_slide_matrix_b)
        {
            win_b = window;
        }
        win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
        win_b.set(Window::DimY, Window::Dimension(0, 0, 0));

        // The step x and step y for the output matrix have already been set in configure()
        Iterator ina(_input0, win_a);
        Iterator inb(_input1, win_b);
        Iterator out(_output, window);

        switch(_input0->info()->data_type())
        {
            case DataType::S8:
            case DataType::QASYMM8_SIGNED:
            {
                matrix_multiply_s8(ina, inb, out, width_b, *_output->info(), window);
                break;
            }
            case DataType::U8:
            case DataType::QASYMM8:
            {
                matrix_multiply_u8(ina, inb, out, width_b, *_output->info(), window);
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Not supported");
                break;
            }
        }
    }
}
} // namespace arm_compute