Blame - src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp - ml/ComputeLibrary

2017-11-16 19:24:39 +0000

[diff] [blame]

123

{

124

ARM_COMPUTE_UNUSED(num_mtx_a_cols);

125

ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));

126

ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

127

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame]

128

return Status{};

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

129

}

130

131

void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)

132

{

133

ARM_COMPUTE_UNUSED(info);

134

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

135

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

136

137

Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

138

139

Window win_input(collapsed_window);

140

win_input.set(Window::DimX, Window::Dimension(0, 0, 0));

141

win_input.set(Window::DimY, Window::Dimension(0, 0, 0));

142

win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

143

144

Iterator in(_input, win_input);

145

Iterator out(_output, collapsed_window);

if(_is_reshaped)

{

execute_window_loop(collapsed_window, [&](const Coordinates & id)

150

{

151

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

152

uint32x4_t sum_row = vdupq_n_u32(0);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

153

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

154

const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

155

156

#if __arm__

157

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));

#endif /* __arm__ */

int i = 0;

// This for loop performs 4 accumulations

162

for(; i <= (_k - 4); i += 4)

163

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

164

const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

165

166

// Convert U8 to U16

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

167

uint16x4x4_t a0_u16 =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

168

{

169

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

170

vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))),

171

vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))),

172

vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))),

173

vget_high_u16(vmovl_u8(vget_high_u8(a0_u8)))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

// Accumulate to U16

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

178

a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]);

179

a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]);

180

a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

181

182

// Accumulate to U32

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

183

sum_row = vaddw_u16(sum_row, a0_u16.val[0]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

184

}

185

186

// This for loop performs the leftover accumulations

187

for(; i < _k; ++i)

188

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

189

const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

190

191

// Convert U8 to U16

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

192

const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

193

194

// Accumulate to U32

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

195

sum_row = vaddw_u16(sum_row, a0_u16);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

196

}

197

198

auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());

199

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

200

vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

},

in, out);

}

else // it is not reshaped

205

{

206

execute_window_loop(collapsed_window, [&](const Coordinates & id)

207

{

208

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

209

uint32x4_t sum_row_u32 = vdupq_n_u32(0);

210

uint32_t sum_row = 0;

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

211

Gian Marco

05288a2

2017-11-21 10:57:50 +0000

[diff] [blame]

212

const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

213

214

#if __arm__

215

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));

#endif /* __arm__ */

int i = 0;

// This for loop performs 16 accumulations

220

for(; i <= (_k - 16); i += 16)

221

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

222

const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

223

224

// Partial accumulations in U16

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

225

const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

226

227

// Accumulate to U32

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

228

sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

229

}

230

231

// This for loop performs the leftover accumulations

232

for(; i < _k; ++i)

233

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

234

sum_row += static_cast<uint32_t>(matrix_a[i]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

235

}

236

237

#if defined(__aarch64__)

238

// Reduction operation available on 64 bit architectures only

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

239

sum_row += vaddvq_u32(sum_row_u32);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

240

#else // __aarch64__

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

241

uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32));

242

tmp = vpadd_u32(tmp, tmp);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

243

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

244

sum_row += vget_lane_u32(tmp, 0);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

245

#endif // __aarch64__

246

247

*(reinterpret_cast<int *>(out.ptr())) = static_cast<int>(sum_row);

},

in, out);

}

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

253

void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

254

{

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

255

ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);

256

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

257

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

258

_input = mtx_b;

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

259

_output = vector_sum_col;

260

_k = num_mtx_b_rows;

261

_is_reshaped = is_transposed1xW;

262

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

263

// Configure kernel window

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

264

auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());

265

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

266

INEKernel::configure(win_config.second);

267

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

268

Georgios Pinitas

631c41a

2017-12-06 11:53:03 +0000

[diff] [blame]

269

/** Static check that the given tensor infos are acceptable for the matrix B reduction kernel.
 *
 * @param[in] mtx_b            Info of the input tensor (matrix B).
 * @param[in] vector_sum_col   Info of the output tensor.
 * @param[in] num_mtx_b_rows   Not used by the validation.
 * @param[in] is_transposed1xW Not used by the validation.
 *
 * @return An empty Status on success, otherwise the first error reported by the helper checks.
 */
Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
{
    ARM_COMPUTE_UNUSED(num_mtx_b_rows);
    ARM_COMPUTE_UNUSED(is_transposed1xW);

    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));

    // The window helper is run on clones rather than on the caller's (const) infos
    const auto mtx_b_copy          = mtx_b->clone();
    const auto vector_sum_col_copy = vector_sum_col->clone();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b_copy.get(), vector_sum_col_copy.get()).first);

    return Status{};
}

278

279

void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)

280

{

281

ARM_COMPUTE_UNUSED(info);

282

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

283

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

284

285

Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

if(_is_reshaped)

{

Window win_input(collapsed_window);

290

win_input.set(Window::DimX, Window::Dimension(0, 0, 0));

291

win_input.set(Window::DimY, Window::Dimension(0, 0, 0));

292

win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

293

294

Iterator in(_input, win_input);

295

Iterator out(_output, collapsed_window);

296

297

execute_window_loop(collapsed_window, [&](const Coordinates & id)

298

{

299

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

300

uint32x4x4_t sum_col =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

301

{

302

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

vdupq_n_u32(0),

vdupq_n_u32(0),

vdupq_n_u32(0),

vdupq_n_u32(0)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

310

const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2];

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

311

312

#if __arm__

313

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));

#endif /* __arm__ */

int i = 0;

for(; i < _k; ++i)

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

319

const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

320

Pablo Tello

6ff12a0

2017-11-02 16:09:35 +0000

[diff] [blame]

321

// Convert S8 to U16

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

322

const uint16x8x2_t b0_u16 =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

323

{

324

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

325

vmovl_u8(vget_low_u8(b0_u8)),

326

vmovl_u8(vget_high_u8(b0_u8))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

// Accumulate to U32

sum_col =

{

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

334

vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),

335

vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),

336

vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),

337

vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

}

auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());

343

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

344

vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));

345

vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));

346

vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));

347

vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

},

in, out);

}

else // it is not reshaped

352

{

353

const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));

354

const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);

355

356

// The implementation computes 16 elements per iteration

357

const int window_start_x = 16 * info.thread_id;

358

const int window_step_x = 16 * info.num_threads;

359

// Make sure (window_end_x - window_start_x) is a multiple of window_step_x

360

const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

361

362

Window win_out(collapsed_window);

363

win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

364

365

Window win_in(win_out);

366

win_in.set(Window::DimY, Window::Dimension(0, 0, 0));

367

win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

368

369

Iterator inb(_input, win_in);

370

Iterator out(_output, win_out);

371

372

execute_window_loop(win_out, [&](const Coordinates & id)

373

{

374

if(id.x() > width_matrix_b)

{

return;

}

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

380

uint32x4x4_t sum_col =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

381

{

382

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

vdupq_n_u32(0),

vdupq_n_u32(0),

vdupq_n_u32(0),

vdupq_n_u32(0)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

390

const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2];

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

391

392

#if __arm__

393

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));

394

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));

#endif /* __arm__ */

int i = 0;

// This for loop performs 4 accumulations

399

for(; i <= (_k - 4); i += 4)

400

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

401

const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);

402

const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride);

403

const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride);

404

const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

405

406

#if __arm__

407

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));

408

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));

409

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));

410

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));

411

#endif /* __arm__ */

412

413

// Partial accumulation in u16

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

414

uint16x8x2_t tmp_sum =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

415

{

416

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

417

vdupq_n_u16(0),

418

vdupq_n_u16(0)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

422

tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8));

423

tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8));

424

tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8));

425

tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8));

426

tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8));

427

tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8));

428

tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8));

429

tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

// Accumulate to U32

sum_col =

{

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

435

vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])),

436

vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])),

437

vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])),

438

vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1]))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

matrix_b += 4 * in_b_stride;

443

}

444

445

// This for loop perfoms the leftover accumulations

446

for(; i < _k; ++i)

447

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

448

const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

449

Pablo Tello

6ff12a0

2017-11-02 16:09:35 +0000

[diff] [blame]

450

// Convert S8 to S16

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

451

const uint16x8x2_t b0_u16 =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

452

{

453

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

454

vmovl_u8(vget_low_u8(b0_u8)),

455

vmovl_u8(vget_high_u8(b0_u8))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

// Accumulate to U32

sum_col =

{

{

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

463

vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),

464

vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),

465

vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),

466

vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

matrix_b += in_b_stride;

471

}

472

473

auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());

474

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

475

vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));

476

vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));

477

vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));

478

vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));

Gian Marco Iodice