/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
#include <tuple>

using namespace arm_compute;

namespace arm_compute
{
class Coordinates;
} // namespace arm_compute

namespace
{
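// Each kernel below is templated on multiply_alpha so that the alpha-scaling branch is taken
// on a compile-time constant: the multiply_alpha = false instantiation lets the compiler fold
// away the extra multiplications instead of testing a runtime flag on every window iteration.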
template <bool multiply_alpha>
void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    const auto width_matrix_b  = static_cast<int>(output->info()->dimension(0));
    const auto in_b_stride     = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
    const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));

    // The implementation computes 32 elements per iteration
    const int window_start_x = 32 * info.thread_id;
    const int window_step_x  = 32 * info.num_threads;
    const int window_end_x   = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
    ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
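    // Each thread owns an interleaved set of 32-wide strips of the output row. Worked example,
    // assuming width_matrix_b = 100 and num_threads = 2:
    //   thread 0: window_end_x = ceil_to_multiple(100 - 0, 64) + 0   = 128 -> x = 0, 64
    //   thread 1: window_end_x = ceil_to_multiple(100 - 32, 64) + 32 = 160 -> x = 32, 96
    // Strips that begin past the end of the row are skipped by the id.x() guard inside the loop.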

    Window win_out(window);
    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
    if(input1->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    Iterator ina(input0, win_a);
    Iterator inb(input1, win_b);
    Iterator out(output, win_out);

    const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
    ARM_COMPUTE_UNUSED(alpha_f16);

    execute_window_loop(win_out, [&](const Coordinates & id)
    {
        if(id.x() > width_matrix_b)
        {
            return;
        }

        float16x8_t acc0 = vdupq_n_f16(0.f);
        float16x8_t acc1 = vdupq_n_f16(0.f);
        float16x8_t acc2 = vdupq_n_f16(0.f);
        float16x8_t acc3 = vdupq_n_f16(0.f);

        auto vec_a    = reinterpret_cast<const float16_t *>(ina.ptr());
        auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr());

        const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
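        // Main loop: consumes 4 elements of vector A per iteration. Each scalar a[k] is broadcast
        // against a 32-wide row of matrix B (four float16x8_t loads) via vmulq_lane_f16, so the
        // four accumulators together hold 32 running dot products of the output row.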
        for(; vec_a <= (vec_a_end_addr - 4);)
        {
            const float16x4_t a0l = vld1_f16(vec_a);

            float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
            float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
            float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
            float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
            float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
            float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
            float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
            float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);

            acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
            acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
            acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
            acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
            acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
            acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
            acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
            acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));

            matrix_b += 2 * in_b_stride;

            b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
            b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
            b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
            b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
            b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
            b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
            b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
            b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);

            acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
            acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
            acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
            acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
            acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
            acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
            acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
            acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));

            vec_a += 4;
            matrix_b += 2 * in_b_stride;
        }

        for(; vec_a < vec_a_end_addr;)
        {
            const float16_t   a0  = *vec_a;
            const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
            const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
            const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
            const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);

            acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
            acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
            acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
            acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));

            vec_a += 1;
            matrix_b += in_b_stride;
        }

        // Multiply by the weight of matrix product (alpha)
        if(multiply_alpha)
        {
            acc0 = vmulq_f16(acc0, alpha_f16);
            acc1 = vmulq_f16(acc1, alpha_f16);
            acc2 = vmulq_f16(acc2, alpha_f16);
            acc3 = vmulq_f16(acc3, alpha_f16);
        }

        const auto vec_out = reinterpret_cast<float16_t *>(out.ptr());

        vst1q_f16(vec_out + 0, acc0);
        vst1q_f16(vec_out + 8, acc1);
        vst1q_f16(vec_out + 16, acc2);
        vst1q_f16(vec_out + 24, acc3);
    },
    ina, inb, out);
#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    ARM_COMPUTE_UNUSED(input0);
    ARM_COMPUTE_UNUSED(input1);
    ARM_COMPUTE_UNUSED(output);
    ARM_COMPUTE_UNUSED(window);
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_UNUSED(alpha);
    ARM_COMPUTE_ERROR("Not implemented");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}

template <bool multiply_alpha>
void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
{
    const auto width_matrix_b  = static_cast<int>(output->info()->dimension(0));
    const auto in_b_stride     = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
    const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));

    // The implementation computes 16 elements per iteration
    const int window_start_x = 16 * info.thread_id;
    const int window_step_x  = 16 * info.num_threads;
    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

    Window win_out(window);
    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
    if(input1->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    Iterator ina(input0, win_a);
    Iterator inb(input1, win_b);
    Iterator out(output, win_out);

    execute_window_loop(win_out, [&](const Coordinates & id)
    {
        if(id.x() > width_matrix_b)
        {
            return;
        }

        float32x4_t acc0 = vdupq_n_f32(0.f);
        float32x4_t acc1 = vdupq_n_f32(0.f);
        float32x4_t acc2 = vdupq_n_f32(0.f);
        float32x4_t acc3 = vdupq_n_f32(0.f);

        auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
        auto matrix_b = reinterpret_cast<const float *>(inb.ptr());

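        // On 32-bit ARM, PLD hints preload the A vector and the first rows of matrix B into the
        // cache ahead of the loads below; on other targets the hint is simply omitted.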
#if __arm__
        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
#endif /* __arm__ */

        auto vec_a_end_addr = vec_a + num_elems_vec_a;
        for(; vec_a <= (vec_a_end_addr - 4);)
        {
            float32x2_t a0l = vld1_f32(vec_a);

            float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
            float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
            float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
            float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);

            float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
            float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
            float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
            float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);

#if __arm__
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
#endif /* __arm__ */

            acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
            acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
            acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
            acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);

            acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
            acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
            acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
            acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);

            vec_a += 2;
            matrix_b += 2 * in_b_stride;

            a0l = vld1_f32(vec_a);

            b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
            b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
            b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
            b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);

            b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
            b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
            b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
            b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);

            acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
            acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
            acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
            acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);

            acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
            acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
            acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
            acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);

            vec_a += 2;
            matrix_b += 2 * in_b_stride;
        }

        for(; vec_a < vec_a_end_addr;)
        {
            const float a0 = *vec_a;

            const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
            const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
            const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
            const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);

            acc0 = vmlaq_n_f32(acc0, b00, a0);
            acc1 = vmlaq_n_f32(acc1, b01, a0);
            acc2 = vmlaq_n_f32(acc2, b02, a0);
            acc3 = vmlaq_n_f32(acc3, b03, a0);

            vec_a += 1;
            matrix_b += in_b_stride;
        }

        // Multiply by the weight of matrix product (alpha)
        if(multiply_alpha)
        {
            const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
            acc0 = vmulq_f32(acc0, alpha_f32);
            acc1 = vmulq_f32(acc1, alpha_f32);
            acc2 = vmulq_f32(acc2, alpha_f32);
            acc3 = vmulq_f32(acc3, alpha_f32);
        }

        const auto vec_out = reinterpret_cast<float *>(out.ptr());

        vst1q_f32(vec_out + 0, acc0);
        vst1q_f32(vec_out + 4, acc1);
        vst1q_f32(vec_out + 8, acc2);
        vst1q_f32(vec_out + 12, acc3);
    },
    ina, inb, out);
}

template <bool multiply_alpha>
void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
{
    const auto width_matrix_b       = static_cast<int>(output->info()->dimension(0));
    const auto in_b_stride          = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
    const auto num_elems_vec_a      = static_cast<int>(input0->info()->dimension(0));
    const int  fixed_point_position = input0->info()->fixed_point_position();

    // The implementation computes 32 elements per iteration
    const int window_start_x = 32 * info.thread_id;
    const int window_step_x  = 32 * info.num_threads;
    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

    Window win_out(window);
    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
    if(input1->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    Iterator ina(input0, win_a);
    Iterator inb(input1, win_b);
    Iterator out(output, win_out);

    execute_window_loop(win_out, [&](const Coordinates & id)
    {
        if(id.x() > width_matrix_b)
        {
            return;
        }

        // Reset accumulators
        qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc03_qs16 = vdupq_n_qs16(0);

        auto vec_a    = reinterpret_cast<const qint8_t *>(ina.ptr());
        auto matrix_b = reinterpret_cast<const qint8_t *>(inb.ptr());

        auto vec_a_end_addr = vec_a + num_elems_vec_a;
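        // vqmlal_qs8 is a saturating multiply-accumulate-long from the NEFixedPoint helpers: each
        // Q(fixed_point_position) product is widened to 16 bits and rescaled by
        // fixed_point_position so the accumulator stays in the same fixed-point format.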
        for(; vec_a <= (vec_a_end_addr - 2);)
        {
            const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0);
            const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1);

            const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride);
            const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride);
            const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride);
            const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride);
            const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride);
            const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride);
            const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride);
            const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride);

            // First accumulation
            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
            acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
            acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);

            // Second accumulation
            acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position);
            acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position);
            acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position);
            acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position);

            vec_a += 2;
            matrix_b += 2 * in_b_stride;
        }

        for(; vec_a < vec_a_end_addr;)
        {
            const qint8x8_t a0 = vld1_dup_qs8(vec_a);

            const qint8x8_t b00 = vld1_qs8(matrix_b + 0);
            const qint8x8_t b01 = vld1_qs8(matrix_b + 8);
            const qint8x8_t b02 = vld1_qs8(matrix_b + 16);
            const qint8x8_t b03 = vld1_qs8(matrix_b + 24);

            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
            acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
            acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);

            vec_a += 1;
            matrix_b += in_b_stride;
        }

        // Convert back to qint8x8_t and saturate
        qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
        qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
        qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
        qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);

        // Multiply by the weight of the matrix product (alpha)
        if(multiply_alpha)
        {
            const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
            acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
            acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
            acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
            acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
        }

        const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());

        // Store the 32 output elements (4 vectors of 8)
        vst1_qs8(mtx_out0 + 0, acc00_qs8);
        vst1_qs8(mtx_out0 + 8, acc01_qs8);
        vst1_qs8(mtx_out0 + 16, acc02_qs8);
        vst1_qs8(mtx_out0 + 24, acc03_qs8);
    },
    ina, inb, out);
}

template <bool multiply_alpha>
void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
{
    const auto width_matrix_b       = static_cast<int>(output->info()->dimension(0));
    const auto in_b_stride          = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
    const auto num_elems_vec_a      = static_cast<int>(input0->info()->dimension(0));
    const int  fixed_point_position = input0->info()->fixed_point_position();
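    // fixed_point_position gives the number of fractional bits of the QS16 format: a raw value v
    // represents v / 2^fixed_point_position, so e.g. 14 fractional bits cover roughly [-2, 2)
    // with a resolution of 2^-14.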

    // The implementation computes 16 elements per iteration
    const int window_start_x = 16 * info.thread_id;
    const int window_step_x  = 16 * info.num_threads;
    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
    ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");

    Window win_out(window);
    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
    if(input1->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    Iterator ina(input0, win_a);
    Iterator inb(input1, win_b);
    Iterator out(output, win_out);

    execute_window_loop(win_out, [&](const Coordinates & id)
    {
        if(id.x() > width_matrix_b)
        {
            return;
        }

        // Reset accumulators
        qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
        qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
        qint32x4_t acc02_qs32 = vdupq_n_qs32(0);
        qint32x4_t acc03_qs32 = vdupq_n_qs32(0);

        auto vec_a    = reinterpret_cast<const qint16_t *>(ina.ptr());
        auto matrix_b = reinterpret_cast<const qint16_t *>(inb.ptr());

        auto vec_a_end_addr = vec_a + num_elems_vec_a;
        for(; vec_a <= (vec_a_end_addr - 2);)
        {
            const qint16x4_t a0 = vld1_dup_qs16(vec_a + 0);
            const qint16x4_t a1 = vld1_dup_qs16(vec_a + 1);

            const qint16x4_t b00 = vld1_qs16(matrix_b + 0 + 0 * in_b_stride);
            const qint16x4_t b01 = vld1_qs16(matrix_b + 4 + 0 * in_b_stride);
            const qint16x4_t b02 = vld1_qs16(matrix_b + 8 + 0 * in_b_stride);
            const qint16x4_t b03 = vld1_qs16(matrix_b + 12 + 0 * in_b_stride);
            const qint16x4_t b10 = vld1_qs16(matrix_b + 0 + 1 * in_b_stride);
            const qint16x4_t b11 = vld1_qs16(matrix_b + 4 + 1 * in_b_stride);
            const qint16x4_t b12 = vld1_qs16(matrix_b + 8 + 1 * in_b_stride);
            const qint16x4_t b13 = vld1_qs16(matrix_b + 12 + 1 * in_b_stride);

            // First accumulation
            acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
            acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
            acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
            acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);

            // Second accumulation
            acc00_qs32 = vqmlal_qs16(acc00_qs32, b10, a1, fixed_point_position);
            acc01_qs32 = vqmlal_qs16(acc01_qs32, b11, a1, fixed_point_position);
            acc02_qs32 = vqmlal_qs16(acc02_qs32, b12, a1, fixed_point_position);
            acc03_qs32 = vqmlal_qs16(acc03_qs32, b13, a1, fixed_point_position);

            vec_a += 2;
            matrix_b += 2 * in_b_stride;
        }

        for(; vec_a < vec_a_end_addr;)
        {
            const qint16x4_t a0 = vld1_dup_qs16(vec_a);

            const qint16x4_t b00 = vld1_qs16(matrix_b + 0);
            const qint16x4_t b01 = vld1_qs16(matrix_b + 4);
            const qint16x4_t b02 = vld1_qs16(matrix_b + 8);
            const qint16x4_t b03 = vld1_qs16(matrix_b + 12);

            acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
            acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
            acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
            acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);

            vec_a += 1;
            matrix_b += in_b_stride;
        }

        // Convert back to qint16x4_t and saturate
        qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
        qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
        qint16x4_t acc02_qs16 = vqmovn_qs32(acc02_qs32);
        qint16x4_t acc03_qs16 = vqmovn_qs32(acc03_qs32);

        // Multiply by the weight of the matrix product (alpha)
        if(multiply_alpha)
        {
            const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
            acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
            acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
            acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position);
            acc03_qs16 = vqmul_qs16(acc03_qs16, alpha_qs16, fixed_point_position);
        }

        const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());

        // Store the 16 output elements (4 vectors of 4)
        vst1_qs16(mtx_out0 + 0, acc00_qs16);
        vst1_qs16(mtx_out0 + 4, acc01_qs16);
        vst1_qs16(mtx_out0 + 8, acc02_qs16);
        vst1_qs16(mtx_out0 + 12, acc03_qs16);
    },
    ina, inb, out);
}

template <bool multiply_alpha>
void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
{
    const size_t in_b_stride          = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
    const size_t out_stride1          = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
    const size_t out_stride2          = out_stride1 * 2;
    const size_t out_stride3          = out_stride1 * 3;
    const int    num_elems_matrix_b_x = input1->info()->dimension(0);

    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
    if(input1->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    // Set step_x and step_y for matrix B. Scale the X range by a factor of 4 as the transposed input matrix B has 4 times fewer columns than the output matrix
    // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4
    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride));
    win_b.set(Window::DimY, Window::Dimension(0, 0, 0));

    Iterator ina(input0, win_a);
    Iterator inb(input1, win_b);
    Iterator out(output, window);

    // The implementation assumes that matrix A and matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
    // The reshaping of the matrices helps to have a cache-friendly implementation and helps to avoid the data re-arrangements needed for computing 8x4 elements per iteration
    // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
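    // One window iteration therefore produces a 4x8 output tile: acc00..acc30 hold the four rows
    // of the left 4x4 block (accumulated from mtx_b0) and acc01..acc31 the four rows of the right
    // 4x4 block (accumulated from mtx_b1 = mtx_b0 + in_b_stride).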
    execute_window_loop(window, [&](const Coordinates & id)
    {
        auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
        auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
        auto mtx_b1 = mtx_b0 + in_b_stride;

        float32x4_t acc00 = vdupq_n_f32(0.f);
        float32x4_t acc10 = vdupq_n_f32(0.f);
        float32x4_t acc20 = vdupq_n_f32(0.f);
        float32x4_t acc30 = vdupq_n_f32(0.f);

        float32x4_t acc01 = vdupq_n_f32(0.f);
        float32x4_t acc11 = vdupq_n_f32(0.f);
        float32x4_t acc21 = vdupq_n_f32(0.f);
        float32x4_t acc31 = vdupq_n_f32(0.f);

#if __arm__
        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */

        auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
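        // Main loop: unrolled by a factor of 4, so each pass consumes 32 interleaved A values and
        // 32 values from each of the two transposed B rows (8 rank-1 accumulation steps in total).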
        for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
        {
            float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
            float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
            float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
            float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);

            float32x4_t b00 = vld1q_f32(mtx_b0);
            float32x4_t b10 = vld1q_f32(mtx_b1);
            float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
            float32x4_t b11 = vld1q_f32(mtx_b1 + 4);

#if __arm__
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */

            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b00, a0);
            acc10 = vmlaq_f32(acc10, b00, a1);
            acc20 = vmlaq_f32(acc20, b00, a2);
            acc30 = vmlaq_f32(acc30, b00, a3);

            float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
            float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
            float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
            float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b10, a0);
            acc11 = vmlaq_f32(acc11, b10, a1);
            acc21 = vmlaq_f32(acc21, b10, a2);
            acc31 = vmlaq_f32(acc31, b10, a3);

            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b01, a4);
            acc10 = vmlaq_f32(acc10, b01, a5);
            acc20 = vmlaq_f32(acc20, b01, a6);
            acc30 = vmlaq_f32(acc30, b01, a7);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b11, a4);
            acc11 = vmlaq_f32(acc11, b11, a5);
            acc21 = vmlaq_f32(acc21, b11, a6);
            acc31 = vmlaq_f32(acc31, b11, a7);

            mtx_a0 += 8;
            mtx_b0 += 8;
            mtx_b1 += 8;

            a0 = vld1q_dup_f32(mtx_a0 + 0);
            a1 = vld1q_dup_f32(mtx_a0 + 1);
            a2 = vld1q_dup_f32(mtx_a0 + 2);
            a3 = vld1q_dup_f32(mtx_a0 + 3);

            b00 = vld1q_f32(mtx_b0);
            b10 = vld1q_f32(mtx_b1);
            b01 = vld1q_f32(mtx_b0 + 4);
            b11 = vld1q_f32(mtx_b1 + 4);

            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b00, a0);
            acc10 = vmlaq_f32(acc10, b00, a1);
            acc20 = vmlaq_f32(acc20, b00, a2);
            acc30 = vmlaq_f32(acc30, b00, a3);

            a4 = vld1q_dup_f32(mtx_a0 + 4);
            a5 = vld1q_dup_f32(mtx_a0 + 5);
            a6 = vld1q_dup_f32(mtx_a0 + 6);
            a7 = vld1q_dup_f32(mtx_a0 + 7);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b10, a0);
            acc11 = vmlaq_f32(acc11, b10, a1);
            acc21 = vmlaq_f32(acc21, b10, a2);
            acc31 = vmlaq_f32(acc31, b10, a3);

            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b01, a4);
            acc10 = vmlaq_f32(acc10, b01, a5);
            acc20 = vmlaq_f32(acc20, b01, a6);
            acc30 = vmlaq_f32(acc30, b01, a7);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b11, a4);
            acc11 = vmlaq_f32(acc11, b11, a5);
            acc21 = vmlaq_f32(acc21, b11, a6);
            acc31 = vmlaq_f32(acc31, b11, a7);

            mtx_a0 += 8;
            mtx_b0 += 8;
            mtx_b1 += 8;

            a0 = vld1q_dup_f32(mtx_a0 + 0);
            a1 = vld1q_dup_f32(mtx_a0 + 1);
            a2 = vld1q_dup_f32(mtx_a0 + 2);
            a3 = vld1q_dup_f32(mtx_a0 + 3);
            b00 = vld1q_f32(mtx_b0);
            b10 = vld1q_f32(mtx_b1);
            b01 = vld1q_f32(mtx_b0 + 4);
            b11 = vld1q_f32(mtx_b1 + 4);

#if __arm__
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */

            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b00, a0);
            acc10 = vmlaq_f32(acc10, b00, a1);
            acc20 = vmlaq_f32(acc20, b00, a2);
            acc30 = vmlaq_f32(acc30, b00, a3);

            a4 = vld1q_dup_f32(mtx_a0 + 4);
            a5 = vld1q_dup_f32(mtx_a0 + 5);
            a6 = vld1q_dup_f32(mtx_a0 + 6);
            a7 = vld1q_dup_f32(mtx_a0 + 7);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b10, a0);
            acc11 = vmlaq_f32(acc11, b10, a1);
            acc21 = vmlaq_f32(acc21, b10, a2);
            acc31 = vmlaq_f32(acc31, b10, a3);

            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b01, a4);
            acc10 = vmlaq_f32(acc10, b01, a5);
            acc20 = vmlaq_f32(acc20, b01, a6);
            acc30 = vmlaq_f32(acc30, b01, a7);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b11, a4);
            acc11 = vmlaq_f32(acc11, b11, a5);
            acc21 = vmlaq_f32(acc21, b11, a6);
            acc31 = vmlaq_f32(acc31, b11, a7);

            mtx_a0 += 8;
            mtx_b0 += 8;
            mtx_b1 += 8;

            a0 = vld1q_dup_f32(mtx_a0 + 0);
            a1 = vld1q_dup_f32(mtx_a0 + 1);
            a2 = vld1q_dup_f32(mtx_a0 + 2);
            a3 = vld1q_dup_f32(mtx_a0 + 3);
            b00 = vld1q_f32(mtx_b0);
            b10 = vld1q_f32(mtx_b1);
            b01 = vld1q_f32(mtx_b0 + 4);
            b11 = vld1q_f32(mtx_b1 + 4);

            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b00, a0);
            acc10 = vmlaq_f32(acc10, b00, a1);
            acc20 = vmlaq_f32(acc20, b00, a2);
            acc30 = vmlaq_f32(acc30, b00, a3);

            a4 = vld1q_dup_f32(mtx_a0 + 4);
            a5 = vld1q_dup_f32(mtx_a0 + 5);
            a6 = vld1q_dup_f32(mtx_a0 + 6);
            a7 = vld1q_dup_f32(mtx_a0 + 7);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b10, a0);
            acc11 = vmlaq_f32(acc11, b10, a1);
            acc21 = vmlaq_f32(acc21, b10, a2);
            acc31 = vmlaq_f32(acc31, b10, a3);

            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b01, a4);
            acc10 = vmlaq_f32(acc10, b01, a5);
            acc20 = vmlaq_f32(acc20, b01, a6);
            acc30 = vmlaq_f32(acc30, b01, a7);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b11, a4);
            acc11 = vmlaq_f32(acc11, b11, a5);
            acc21 = vmlaq_f32(acc21, b11, a6);
            acc31 = vmlaq_f32(acc31, b11, a7);

            mtx_a0 += 8;
            mtx_b0 += 8;
            mtx_b1 += 8;
        }

        for(; mtx_b0 < mtx_b0_end_addr;)
        {
            float32x4_t a0  = vld1q_dup_f32(mtx_a0 + 0);
            float32x4_t a1  = vld1q_dup_f32(mtx_a0 + 1);
            float32x4_t a2  = vld1q_dup_f32(mtx_a0 + 2);
            float32x4_t a3  = vld1q_dup_f32(mtx_a0 + 3);
            float32x4_t b00 = vld1q_f32(mtx_b0);
            float32x4_t b10 = vld1q_f32(mtx_b1);

#if __arm__
            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
            // 4x4 block 0
            acc00 = vmlaq_f32(acc00, b00, a0);
            acc10 = vmlaq_f32(acc10, b00, a1);
            acc20 = vmlaq_f32(acc20, b00, a2);
            acc30 = vmlaq_f32(acc30, b00, a3);

            // 4x4 block 1
            acc01 = vmlaq_f32(acc01, b10, a0);
            acc11 = vmlaq_f32(acc11, b10, a1);
            acc21 = vmlaq_f32(acc21, b10, a2);
            acc31 = vmlaq_f32(acc31, b10, a3);

            mtx_a0 += 4;
            mtx_b0 += 4;
            mtx_b1 += 4;
        }

        // Multiply by the weight of matrix product (alpha)
        if(multiply_alpha)
        {
            const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
            acc00 = vmulq_f32(acc00, alpha_f32);
            acc10 = vmulq_f32(acc10, alpha_f32);
            acc20 = vmulq_f32(acc20, alpha_f32);
            acc30 = vmulq_f32(acc30, alpha_f32);
            acc01 = vmulq_f32(acc01, alpha_f32);
            acc11 = vmulq_f32(acc11, alpha_f32);
            acc21 = vmulq_f32(acc21, alpha_f32);
            acc31 = vmulq_f32(acc31, alpha_f32);
        }

        const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
        const auto mtx_out1 = mtx_out0 + 4;
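        // mtx_out0 points at the left 4x4 block of the output tile and mtx_out1 at the right one;
        // out_stride1..out_stride3 step down the remaining three rows of the destination matrix.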

        // Store the two 4x4 blocks
        vst1q_f32(mtx_out0, acc00);
        vst1q_f32(mtx_out1, acc01);
        vst1q_f32(mtx_out0 + out_stride1, acc10);
        vst1q_f32(mtx_out1 + out_stride1, acc11);
        vst1q_f32(mtx_out0 + out_stride2, acc20);
        vst1q_f32(mtx_out1 + out_stride2, acc21);
        vst1q_f32(mtx_out0 + out_stride3, acc30);
        vst1q_f32(mtx_out1 + out_stride3, acc31);
    },
    ina, inb, out);
}

template <bool multiply_alpha>
void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    const size_t in_b_stride          = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
    const size_t out_stride           = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
    const int    num_elems_matrix_b_x = input1->info()->dimension(0);

    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
    if(input1->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    // Set step_x and step_y for matrix B. Scale the X range by a factor of 8 as the transposed input matrix B has 8 times fewer columns than the output matrix
    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
    win_b.set(Window::DimY, Window::Dimension(0, 1, 0));

    Iterator ina(input0, win_a);
    Iterator inb(input1, win_b);
    Iterator out(output, window);

    const float16x8_t alpha_f16 = vdupq_n_f16(alpha);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto *mtx_a0  = reinterpret_cast<const float16_t *>(ina.ptr());
        const auto *mtx_b0  = reinterpret_cast<const float16_t *>(inb.ptr());
        auto       *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
        float16x8x4_t c =
        {
            {
                vdupq_n_f16(0.f),
                vdupq_n_f16(0.f),
                vdupq_n_f16(0.f),
                vdupq_n_f16(0.f)
            }
        };

        /*
        This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
                 |a00 a01 a02 a03 | a04 a05 a06 a07|
                 |a10 a11 a12 a13 | a14 a15 a16 a17|
                 |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ...
                 |a30 a31 a32 a33 | a34 a35 a36 a37|   | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a16 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ...
                 |a40 a41 a42 a43 | a44 a45 a46 a47|
                 |a50 a51 a52 a53 | a54 a55 a56 a57|
                 |a60 a61 a62 a63 | a64 a65 a66 a67|
                 |a70 a71 a72 a73 | a74 a75 a76 a77|

        After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]

        B Matrix has been transposed as shown below

           |b00 b01 b02 b03 b04 b05 b06 b07|
           |b10 b11 b12 b13 b14 b15 b16 b17|
           |b20 b21 b22 b23 b24 b25 b26 b27|
           |b30 b31 b32 b33 b34 b35 b36 b37|
           ------------------->

           |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37|

        c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30
        c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31

        The size of the output tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size.
        */
        const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;

        for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
        {
            const float16x8_t p00 = vld1q_f16(mtx_a0);
            const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);

            const float16x8_t q00 = vld1q_f16(mtx_b0);
            const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
            const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
            const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);

            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));

            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));

            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));

            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));

            mtx_a0 += 16;
            mtx_b0 += 32;
        }

        for(; mtx_b0 < mtx_b0_end_addr;)
        {
            const float16x4_t p00 = vld1_f16(mtx_a0);
            const float16x8_t q00 = vld1q_f16(mtx_b0);

            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));

            mtx_a0 += 4;
            mtx_b0 += 8;
        }

        if(multiply_alpha)
        {
            c.val[0] = vmulq_f16(c.val[0], alpha_f16);
            c.val[1] = vmulq_f16(c.val[1], alpha_f16);
            c.val[2] = vmulq_f16(c.val[2], alpha_f16);
            c.val[3] = vmulq_f16(c.val[3], alpha_f16);
        }

        vst1q_f16(mtx_out + 0 * out_stride, c.val[0]);
        vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
        vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
        vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
    },
    ina, inb, out);
#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    ARM_COMPUTE_UNUSED(input0);
    ARM_COMPUTE_UNUSED(input1);
    ARM_COMPUTE_UNUSED(output);
    ARM_COMPUTE_UNUSED(window);
    ARM_COMPUTE_UNUSED(alpha);
    ARM_COMPUTE_ERROR("Not implemented");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}

template <bool multiply_alpha>
void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
{
    const size_t in_b_stride          = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
    const size_t out_stride1          = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
    const size_t out_stride2          = out_stride1 * 2;
    const size_t out_stride3          = out_stride1 * 3;
    const int    num_elems_matrix_b_x = input1->info()->dimension(0);
    const int    fixed_point_position = input0->info()->fixed_point_position();
    const qint8x8_t alpha_qs8         = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
    ARM_COMPUTE_UNUSED(alpha_qs8);

    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
    if(input1->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    // Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the transposed input matrix B has 16 times fewer columns than the output matrix
    // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 16x4
    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride));
    win_b.set(Window::DimY, Window::Dimension(0, 0, 0));

    Iterator ina(input0, win_a);
    Iterator inb(input1, win_b);
    Iterator out(output, window);

    // The implementation assumes that matrix A and matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
    // The reshaping of the matrices helps to have a cache-friendly implementation and helps to avoid the data re-arrangements needed for computing 32x4 elements per iteration
    // All the values needed for computing a single 32x4 block will be read from consecutive memory positions
    execute_window_loop(window, [&](const Coordinates & id)
    {
        auto mtx_a0 = reinterpret_cast<const qint8_t *>(ina.ptr());
        auto mtx_b0 = reinterpret_cast<const qint8_t *>(inb.ptr());
        auto mtx_b1 = mtx_b0 + in_b_stride;

        qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc10_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc20_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc30_qs16 = vdupq_n_qs16(0);

        qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc11_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc21_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc31_qs16 = vdupq_n_qs16(0);

        qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc12_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc22_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc32_qs16 = vdupq_n_qs16(0);

        qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc13_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc23_qs16 = vdupq_n_qs16(0);
        qint16x8_t acc33_qs16 = vdupq_n_qs16(0);

        int k = 0;
        // This for loop performs 2 accumulations
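        // Each iteration consumes 8 interleaved A values and 32 values from each transposed B row:
        // the first pass multiplies b00/b01/b10/b11 by a0..a3, the second pass b02/b03/b12/b13 by
        // a4..a7, accumulating into sixteen 16-bit saturating registers (a 32x4 output tile).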
1130 for(; k <= (num_elems_matrix_b_x - 32); k += 32)
1131 {
1132 const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
1133 const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
1134 const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
1135 const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
1136 const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4);
1137 const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5);
1138 const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6);
1139 const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7);
1140
1141 const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
1142 const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
1143 const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
1144 const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
1145
1146 // First accumulation
1147 acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
1148 acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
1149 acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
1150 acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
1151 acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
1152 acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
1153 acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
1154 acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
1155
1156 const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16);
1157 const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24);
1158 const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16);
1159 const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24);
1160
1161 acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
1162 acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
1163 acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
1164 acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
1165 acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
1166 acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
1167 acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
1168 acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
1169
1170#if __arm__
1171 asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
1172 asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
1173 asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
Anthony Barbierac69aa12017-07-03 17:39:37 +01001174#endif /* __arm__ */
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001175
1176 // Second accumulation
1177 acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
1178 acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position);
1179 acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position);
1180 acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position);
1181 acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position);
1182 acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position);
1183 acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position);
1184 acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position);
1185 acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position);
1186 acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position);
1187 acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position);
1188 acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position);
1189 acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position);
1190 acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position);
1191 acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position);
1192 acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position);
1193
1194 mtx_a0 += 8;
1195 mtx_b0 += 32;
1196 mtx_b1 += 32;
1197 }
1198
1199 // This for loop performs the left over accumulations
1200 for(; k < num_elems_matrix_b_x; k += 16)
1201 {
1202 const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
1203 const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
1204 const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
1205 const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
1206
1207 const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
1208 const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
1209 const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
1210 const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
1211
1212 acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
1213 acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
1214 acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
1215 acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
1216 acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
1217 acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
1218 acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
1219 acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
1220 acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
1221 acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
1222 acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
1223 acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
1224 acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
1225 acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
1226 acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
1227 acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
1228
1229 mtx_a0 += 4;
1230 mtx_b0 += 16;
1231 mtx_b1 += 16;
1232 }
1233
1234 // Convert back to qint8x8_t and saturate
1235 qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
1236 qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16);
1237 qint8x8_t acc20_qs8 = vqmovn_qs16(acc20_qs16);
1238 qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16);
1239
1240 qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
1241 qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16);
1242 qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16);
1243 qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16);
1244
1245 qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
1246 qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16);
1247 qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16);
1248 qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16);
1249
1250 qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
1251 qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16);
1252 qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16);
1253 qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16);
1254
1255 // Multiply by the weight of the matrix product (alpha)
1256 if(multiply_alpha)
1257 {
1258 acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
1259 acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position);
1260 acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position);
1261 acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position);
1262 acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
1263 acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position);
1264 acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position);
1265 acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position);
1266 acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
1267 acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position);
1268 acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position);
1269 acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position);
1270 acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
1271 acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position);
1272 acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position);
1273 acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position);
1274 }
1275
1276 const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
1277
1278 // Store 32x4 output elements
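        // Layout of the 32x4 tile written below (rows out_stride1/2/3 apart):
        //   row 0: acc00 | acc01 | acc02 | acc03   (columns 0..31, 8 lanes each)
        //   row 1: acc10 | acc11 | acc12 | acc13
        //   row 2: acc20 | acc21 | acc22 | acc23
        //   row 3: acc30 | acc31 | acc32 | acc33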
1279 vst1_qs8(mtx_out0 + 0, acc00_qs8);
1280 vst1_qs8(mtx_out0 + 8, acc01_qs8);
1281 vst1_qs8(mtx_out0 + 16, acc02_qs8);
1282 vst1_qs8(mtx_out0 + 24, acc03_qs8);
1283 vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8);
1284 vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8);
1285 vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8);
1286 vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8);
1287 vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8);
1288 vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8);
1289 vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8);
1290 vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8);
1291 vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8);
1292 vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8);
1293 vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8);
1294 vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8);
1295 },
1296 ina, inb, out);
1297}
1298
Gian Marco Iodicebdb6b0b2017-06-30 12:21:00 +01001299template <bool multiply_alpha>
1300void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
1301{
1302 const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
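    // strides_in_bytes()[1] is the byte distance between consecutive rows; dividing
    // by the element size expresses the strides in elements, matching the qint16_t*
    // pointer arithmetic used below.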
1303 const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
1304 const size_t out_stride2 = out_stride1 * 2;
1305 const size_t out_stride3 = out_stride1 * 3;
1306 const int num_elems_matrix_b_x = input1->info()->dimension(0);
1307 const int fixed_point_position = input0->info()->fixed_point_position();
Georgios Pinitas21efeb42017-07-04 12:47:17 +01001308 const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
Gian Marco Iodicebdb6b0b2017-06-30 12:21:00 +01001309 ARM_COMPUTE_UNUSED(alpha_qs16);
1310
1311 // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix
1312 Window win_a(window);
1313 win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
1314 win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
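    // Hypothetical example: an output window covering rows [0, 64) maps to rows
    // [0, 16) of the interleaved matrix A, since each interleaved row packs 4
    // output rows.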
1315
1316 Window win_b;
1317 // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
1318 // This scenario can happen when the matrix multiplication is used to perform a convolution operation
1319 if(input1->info()->num_dimensions() >= 3)
1320 {
1321 win_b = window;
1322 }
1323 // Set step_x and step_y for matrix B. Scale the X range by a factor of 8, as the transposed input matrix B has 8 times fewer columns than the output matrix
1324 win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
1325 win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
1326
1327 Iterator ina(input0, win_a);
1328 Iterator inb(input1, win_b);
1329 Iterator out(output, window);
1330
1331 // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
1332 // Reshaping the matrices makes the implementation cache-friendly and avoids the data re-arrangements otherwise needed to compute 8x4 elements per iteration
1333 // All the values needed to compute a single 8x4 block are read from consecutive memory positions
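    // A minimal sketch of the assumed layouts (indices refer to the original
    // matrices; illustration only):
    //   A after NEGEMMInterleave4x4 : a00 a10 a20 a30 a01 a11 a21 a31 ...
    //     -> mtx_a0[0..3] below are the 4 row entries for a single k
    //   B after NEGEMMTranspose1xW  : b00 b01 ... b07 b10 b11 ... b17 ...
    //     -> mtx_b0[0..7] below are the 8 column entries for a single k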
1334 execute_window_loop(window, [&](const Coordinates & id)
1335 {
1336 auto mtx_a0 = reinterpret_cast<const qint16_t *>(ina.ptr());
1337 auto mtx_b0 = reinterpret_cast<const qint16_t *>(inb.ptr());
1338 auto mtx_b1 = mtx_b0 + in_b_stride;
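        // Note: mtx_b1 is advanced in step with mtx_b0 but is never dereferenced by
        // this 8x4 QS16 kernel (unlike the QS8 variant above, which reads two B rows)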
1339
1340 qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
1341 qint32x4_t acc10_qs32 = vdupq_n_qs32(0);
1342 qint32x4_t acc20_qs32 = vdupq_n_qs32(0);
1343 qint32x4_t acc30_qs32 = vdupq_n_qs32(0);
1344
1345 qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
1346 qint32x4_t acc11_qs32 = vdupq_n_qs32(0);
1347 qint32x4_t acc21_qs32 = vdupq_n_qs32(0);
1348 qint32x4_t acc31_qs32 = vdupq_n_qs32(0);
1349
1350 // Each iteration of this for loop performs one accumulation step along the K dimension (4 values of A against 8 values of B)
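        // Scalar sketch of one iteration (illustrative only; "acc" is a hypothetical
        // 4x8 array standing in for the acc*_qs32 accumulators):
        //
        //   for(int i = 0; i < 4; ++i)      // 4 output rows, from mtx_a0[0..3]
        //       for(int j = 0; j < 8; ++j)  // 8 output columns, from mtx_b0[0..7]
        //           acc[i][j] += (mtx_a0[i] * mtx_b0[j]) >> fixed_point_position;
        //
        // with saturation applied at each step, as vqmlal_qs16 does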
1351 for(int k = 0; k <= (num_elems_matrix_b_x - 8); k += 8)
1352 {
1353 const qint16x4_t a0 = vld1_dup_qs16(mtx_a0 + 0);
1354 const qint16x4_t a1 = vld1_dup_qs16(mtx_a0 + 1);
1355 const qint16x4_t a2 = vld1_dup_qs16(mtx_a0 + 2);
1356 const qint16x4_t a3 = vld1_dup_qs16(mtx_a0 + 3);
1357
1358 const qint16x4_t b00 = vld1_qs16(mtx_b0 + 0);
1359 const qint16x4_t b01 = vld1_qs16(mtx_b0 + 4);
1360
1361 acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
1362 acc10_qs32 = vqmlal_qs16(acc10_qs32, b00, a1, fixed_point_position);
1363 acc20_qs32 = vqmlal_qs16(acc20_qs32, b00, a2, fixed_point_position);
1364 acc30_qs32 = vqmlal_qs16(acc30_qs32, b00, a3, fixed_point_position);
1365 acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
1366 acc11_qs32 = vqmlal_qs16(acc11_qs32, b01, a1, fixed_point_position);
1367 acc21_qs32 = vqmlal_qs16(acc21_qs32, b01, a2, fixed_point_position);
1368 acc31_qs32 = vqmlal_qs16(acc31_qs32, b01, a3, fixed_point_position);
1369
1370 mtx_a0 += 4;
1371 mtx_b0 += 8;
1372 mtx_b1 += 8;
1373 }
1374
1375 // Convert back to qint16x4_t and saturate
1376 qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
1377 qint16x4_t acc10_qs16 = vqmovn_qs32(acc10_qs32);
1378 qint16x4_t acc20_qs16 = vqmovn_qs32(acc20_qs32);
1379 qint16x4_t acc30_qs16 = vqmovn_qs32(acc30_qs32);
1380
1381 qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
1382 qint16x4_t acc11_qs16 = vqmovn_qs32(acc11_qs32);
1383 qint16x4_t acc21_qs16 = vqmovn_qs32(acc21_qs32);
1384 qint16x4_t acc31_qs16 = vqmovn_qs32(acc31_qs32);
1385
1386 // Multiply by the weight of the matrix product (alpha)
1387 if(multiply_alpha)
1388 {
1389 acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
1390 acc10_qs16 = vqmul_qs16(acc10_qs16, alpha_qs16, fixed_point_position);
1391 acc20_qs16 = vqmul_qs16(acc20_qs16, alpha_qs16, fixed_point_position);
1392 acc30_qs16 = vqmul_qs16(acc30_qs16, alpha_qs16, fixed_point_position);
1393 acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
1394 acc11_qs16 = vqmul_qs16(acc11_qs16, alpha_qs16, fixed_point_position);
1395 acc21_qs16 = vqmul_qs16(acc21_qs16, alpha_qs16, fixed_point_position);
1396 acc31_qs16 = vqmul_qs16(acc31_qs16, alpha_qs16, fixed_point_position);
1397 }
1398
1399 const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
1400
1401 // Store 8x4 output elements
1402 vst1_qs16(mtx_out0 + 0, acc00_qs16);
1403 vst1_qs16(mtx_out0 + 4, acc01_qs16);
1404 vst1_qs16(mtx_out0 + out_stride1 + 0, acc10_qs16);
1405 vst1_qs16(mtx_out0 + out_stride1 + 4, acc11_qs16);
1406 vst1_qs16(mtx_out0 + out_stride2 + 0, acc20_qs16);
1407 vst1_qs16(mtx_out0 + out_stride2 + 4, acc21_qs16);
1408 vst1_qs16(mtx_out0 + out_stride3 + 0, acc30_qs16);
1409 vst1_qs16(mtx_out0 + out_stride3 + 4, acc31_qs16);
1410 },
1411 ina, inb, out);
1412}
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001413
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001414inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001415{
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001416 ARM_COMPUTE_UNUSED(alpha);
1417
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001418 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
1419 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
1420 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001421
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001422 if(!is_interleaved)
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001423 {
1424 ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001425
1426 if(output->total_size() != 0)
1427 {
1428 ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
1429 ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
1430 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
1431 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
1432 }
1433 }
1434 else
1435 {
1436 const int m = reshape_info.m();
1437 const int n = reshape_info.n();
1438 const int k = reshape_info.k();
1439 const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
1440 const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
1441
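    /* Hedged sketch of the checks below, assuming the shape calculators behave as:
     *   compute_interleaved_shape:
     *     (k, m) -> (k * 4 * mult_interleave4x4_height, ceil(m / (4 * mult_interleave4x4_height)))
     *   compute_transpose1xW_with_element_size_shape:
     *     (n, k) -> (k * W, ceil(n / W)), with W = (16 / element_size) * mult_transpose1xW_width
     * e.g. W = 16 for QS8 and W = 8 for QS16, matching the 16- and 8-element B blocks
     * consumed by the kernels above. */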
1442 /* Interleave */
1443 TensorShape tensor_shape0{ input0->tensor_shape() };
1444 tensor_shape0.set(0, k);
1445 tensor_shape0.set(1, m);
1446
1447 const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
1448 const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
1449 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
1450
1451 if(n != 0) /* Transpose */
1452 {
1453 TensorShape tensor_shape1{ input1->tensor_shape() };
1454 tensor_shape1.set(0, n);
1455 tensor_shape1.set(1, k);
1456
1457 const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
1458 const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
1459 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
1460 }
1461
1462 if(output->total_size() != 0)
1463 {
1464 if(n != 0)
1465 {
1466 ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
1467 }
1468 ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
1469 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
1470 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
1471 }
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001472 }
1473
1474 return Status{};
1475}
1476
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001477inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001478{
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001479 bool window_changed{};
1480 Window win{};
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001481
1482 unsigned int num_elems_processed_per_iteration_x = 0;
1483 const unsigned int num_elems_processed_per_iteration_y = 4;
1484
1485 // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
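    // E.g. a GEMV case: input0 of shape (K, 1) against input1 of shape (N, K)
    // produces an output of shape (N, 1), so dimension(1) == 1 selects the
    // vector-matrix path.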
1486 if((output->dimension(1) == 1))
1487 {
1488 switch(input0->data_type())
1489 {
1490 case DataType::F32:
1491 {
1492 num_elems_processed_per_iteration_x = 16;
1493 break;
1494 }
1495 case DataType::QS8:
1496 {
1497 num_elems_processed_per_iteration_x = 32;
1498 break;
1499 }
1500 case DataType::QS16:
1501 {
1502 num_elems_processed_per_iteration_x = 16;
1503 break;
1504 }
1505#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1506 case DataType::F16:
1507 {
1508 num_elems_processed_per_iteration_x = 32;
1509 break;
1510 }
1511#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1512 default:
1513 {
1514 ARM_COMPUTE_ERROR("Data type not supported");
1515 break;
1516 }
1517 }
1518
1519 // Configure kernel window
1520 win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
1521
1522 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
1523
1524 window_changed = update_window_and_padding(win,
1525 AccessWindowStatic(input0, 0, 0, input0->tensor_shape().x(), 1),
1526 AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration_x),
1527 output_access);
1528
1529 Coordinates coord;
1530 coord.set_num_dimensions(output->num_dimensions());
1531 output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
1532 }
1533 else
1534 {
1535 switch(input0->data_type())
1536 {
1537 case DataType::F32:
1538 {
1539 num_elems_processed_per_iteration_x = 8;
1540 break;
1541 }
1542 case DataType::QS8:
1543 {
1544 num_elems_processed_per_iteration_x = 32;
1545 break;
1546 }
1547 case DataType::QS16:
1548 {
1549 num_elems_processed_per_iteration_x = 8;
1550 break;
1551 }
1552#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1553 case DataType::F16:
1554 {
1555 num_elems_processed_per_iteration_x = 8;
1556 break;
1557 }
1558#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1559 default:
1560 {
1561 ARM_COMPUTE_ERROR("Data type not supported");
1562 break;
1563 }
1564 }
1565
1566 // Configure kernel window
1567 win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
1568
1569 AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
1570
1571 window_changed = update_window_and_padding(win,
1572 AccessWindowRectangle(input0, 0, 0, 4, 1, 1.f, 0.25f),
1573 AccessWindowStatic(input1, 0, 0, input1->tensor_shape().x(), ceil_to_multiple(input1->tensor_shape().y(), 4)),
1574 output_access);
1575
1576 output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
1577 }
1578
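    // update_window_and_padding() reports whether it had to modify the window or the
    // tensors' padding; a changed window here means the existing padding cannot
    // satisfy the declared access patterns, which is surfaced as the error below.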
1579 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
1580 return std::make_pair(err, win);
1581}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001582} // namespace
1583
1584NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
1585 : _input0(nullptr), _input1(nullptr), _output(nullptr), _alpha(1.0f)
1586{
1587}
1588
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001589void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001590{
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001591 ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001592
1593 // Output tensor auto-initialization if not yet initialized
1594 TensorShape tensor_shape{ input0->info()->tensor_shape() };
1595 tensor_shape.set(0, is_interleaved ? reshape_info.n() : input1->info()->dimension(0));
1596 tensor_shape.set(1, is_interleaved ? reshape_info.m() : input0->info()->dimension(1));
1597
1598 auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
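    // Hypothetical example: with is_interleaved = true and reshape_info carrying
    // m = 64 and n = 128, an empty output is auto-initialized to shape (128, 64)
    // with input0's data type; an already-initialized output is left as-is and
    // checked by the validation step below.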
1599
1600 // Perform validate step
1601 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), alpha, is_interleaved, reshape_info));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001602
1603 _input0 = input0;
1604 _input1 = input1;
1605 _output = output;
1606 _alpha = alpha;
1607
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001608 // Configure kernel window
1609 auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
1610 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
1611 INEKernel::configure(win_config.second);
1612}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001613
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001614Status NEGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved,
1615 const GEMMReshapeInfo &reshape_info)
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001616{
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001617 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, alpha, is_interleaved, reshape_info));
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001618 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001619
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001620 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001621}
1622
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001623void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001624{
1625 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1626 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1627
1628 bool multiply_alpha = std::abs(1.0f - _alpha) > 0.00001f;
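    // The epsilon comparison treats alpha values within 1e-5 of 1.0f as exactly 1,
    // e.g. alpha = 1.0000001f still dispatches to the non-multiplying variants below.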
1629
Gian Marco Iodicebdb6b0b2017-06-30 12:21:00 +01001630 // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001631 if((_output->info()->dimension(1) == 1))
1632 {
1633 switch(_input0->info()->data_type())
1634 {
1635 case DataType::F32:
1636 {
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001637 multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, info, _alpha) :
1638 vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, info, _alpha);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001639 break;
1640 }
1641 case DataType::QS8:
1642 {
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001643 multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, info, _alpha) :
1644 vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, info, _alpha);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001645 break;
1646 }
Gian Marco Iodicebdb6b0b2017-06-30 12:21:00 +01001647 case DataType::QS16:
1648 {
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001649 multiply_alpha ? vector_matrix_multiply_qs16<true>(_input0, _input1, _output, window, info, _alpha) :
1650 vector_matrix_multiply_qs16<false>(_input0, _input1, _output, window, info, _alpha);
Gian Marco Iodicebdb6b0b2017-06-30 12:21:00 +01001651 break;
1652 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001653#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello221f3812017-06-28 17:27:56 +01001654 case DataType::F16:
1655 {
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001656 multiply_alpha ? vector_matrix_multiply_f16<true>(_input0, _input1, _output, window, info, _alpha) :
1657 vector_matrix_multiply_f16<false>(_input0, _input1, _output, window, info, _alpha);
Pablo Tello221f3812017-06-28 17:27:56 +01001658 break;
1659 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001660#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001661 default:
1662 {
1663 ARM_COMPUTE_ERROR("Data type not supported");
1664 break;
1665 }
1666 }
1667 }
1668 else
1669 {
1670 switch(_input0->info()->data_type())
1671 {
1672 case DataType::F32:
1673 {
1674 multiply_alpha ? matrix_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
1675 matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
1676 break;
1677 }
1678 case DataType::QS8:
1679 {
1680 multiply_alpha ? matrix_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
1681 matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
1682 break;
1683 }
Gian Marco Iodicebdb6b0b2017-06-30 12:21:00 +01001684 case DataType::QS16:
1685 {
1686 multiply_alpha ? matrix_matrix_multiply_qs16<true>(_input0, _input1, _output, window, _alpha) :
1687 matrix_matrix_multiply_qs16<false>(_input0, _input1, _output, window, _alpha);
1688 break;
1689 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001690#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001691 case DataType::F16:
1692 {
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001693 multiply_alpha ? matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha) :
1694 matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
1695 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001696 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001697#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001698 default:
1699 {
1700 ARM_COMPUTE_ERROR("Data type not supported");
1701 break;
1702 }
1703 }
1704 }
1705}