Blame - src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp - ml/ComputeLibrary

2017-12-06 11:53:03 +0000

[diff] [blame]

55

return Status{};

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

56

}

Georgios Pinitas

2017-12-06 11:53:03 +0000

[diff] [blame]

57

/** Auto-initialise the output vector and compute the execution window for the matrix A reduction.
 *
 * @param[in]      input       Info of the matrix to reduce.
 * @param[in, out] output      Info of the output vector. If empty it is initialised as an S32 vector
 *                             whose length is input->dimension(1).
 * @param[in]      is_reshaped Selects the window step along X: 4 when true, 1 otherwise.
 *
 * @return A pair made of the status (a RUNTIME_ERROR "Insufficient Padding!" when
 *         update_window_and_padding() reports a change, an empty Status otherwise) and the window.
 */
std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
{
    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, TensorShape(input->dimension(1)), 1, DataType::S32);

    // Number of output elements produced per window iteration
    const unsigned int step_x = is_reshaped ? 4 : 1;

    Window window = calculate_max_window(*output, Steps(step_x));

    // The input is accessed over its full height, with the width rounded up to a multiple of 16
    AccessWindowStatic     in_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
    AccessWindowHorizontal out_access(output, 0, step_x);

    const bool padding_changed = update_window_and_padding(window, in_access, out_access);

    out_access.set_valid_region(window, ValidRegion(Coordinates(), output->tensor_shape()));

    Status status{};
    if(padding_changed)
    {
        status = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
    }
    return std::make_pair(status, window);
}

76

Georgios Pinitas

2017-12-06 11:53:03 +0000

[diff] [blame]

77

/** Validate the tensor infos passed to the matrix B reduction.
 *
 * @param[in] input  Info of the matrix to reduce. Must be QASYMM8, QASYMM8_SIGNED or QSYMM8_PER_CHANNEL with one channel.
 * @param[in] output Info of the output vector. It is only checked when already initialised (total_size() != 0):
 *                   it must then be S32 and its first dimension must match the first dimension of @p input.
 *
 * @return An empty Status on success, the first failing check otherwise.
 */
Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);

    // An uninitialised output has nothing to validate yet
    if(output->total_size() == 0)
    {
        return Status{};
    }

    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");

    return Status{};
}

89

Georgios Pinitas

2017-12-06 11:53:03 +0000

[diff] [blame]

90

/** Auto-initialise the output vector and compute the execution window for the matrix B reduction.
 *
 * @param[in]      input  Info of the matrix to reduce.
 * @param[in, out] output Info of the output vector. If empty it is initialised as an S32 vector
 *                        whose length is input->dimension(0).
 *
 * @return A pair made of the status (a RUNTIME_ERROR "Insufficient Padding!" when
 *         update_window_and_padding() reports a change, an empty Status otherwise) and the window.
 */
std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
{
    // Number of output elements produced per window iteration
    constexpr unsigned int step_x = 16;

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32);

    // Configure kernel window
    Window window = calculate_max_window(*output, Steps(step_x));

    // The input is accessed over its full height, with the width rounded up to a multiple of 16
    AccessWindowStatic     in_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
    AccessWindowHorizontal out_access(output, 0, step_x);

    const bool padding_changed = update_window_and_padding(window, in_access, out_access);

    out_access.set_valid_region(window, ValidRegion(Coordinates(), output->tensor_shape()));

    Status status{};
    if(padding_changed)
    {
        status = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
    }
    return std::make_pair(status, window);
}

} // namespace

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

112

/** Default constructor.
 *
 * Value-initialises the input/output members, sets k and the scalar to zero and clears
 * both the "reshaped" and "multiply by scalar" flags.
 */
INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
    : _input(),
      _output(),
      _k(0),
      _is_reshaped(false),
      _scalar(0),
      _mul_by_scalar(false)
{
}

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

117

void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

118

{

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

119

// Perform validate step

120

ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);

121

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

122

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

123

_input = mtx_a;

124

_output = vector_sum_row;

125

_k = info.k;

126

_is_reshaped = info.is_reshaped;

127

_scalar = info.scalar;

128

_mul_by_scalar = info.mul_by_scalar;

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

129

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

130

// Configure kernel window

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

131

auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);

132

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

133

INEKernel::configure(win_config.second);

134

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

135

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

136

/** Static check of whether the given infos would yield a valid matrix A reduction configuration.
 *
 * The window configuration is exercised on clones, so neither @p mtx_a nor @p vector_sum_row is modified.
 *
 * @param[in] mtx_a          Info of the input matrix.
 * @param[in] vector_sum_row Info of the output vector.
 * @param[in] info           Kernel metadata; only is_reshaped is read here.
 *
 * @return An empty Status on success, the first error encountered otherwise.
 */
Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));

    // Work on copies: the window configuration may auto-initialise the output and update paddings
    const auto mtx_a_copy          = mtx_a->clone();
    const auto vector_sum_row_copy = vector_sum_row->clone();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a_copy.get(), vector_sum_row_copy.get(), info.is_reshaped).first);

    return Status{};
}

143

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

144

template <typename T>

145

void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

146

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

147

// Intermediate and final accumulator types

148

using TIAcc = wrapper::traits::promote_t<T>;

149

using TAcc = wrapper::traits::promote_t<TIAcc>;

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

150

151

Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

152

153

Window win_input(collapsed_window);

154

win_input.set(Window::DimX, Window::Dimension(0, 0, 0));

155

win_input.set(Window::DimY, Window::Dimension(0, 0, 0));

156

win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

157

158

Iterator in(_input, win_input);

159

Iterator out(_output, collapsed_window);

160

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

161

const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});

162

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

163

if(_is_reshaped)

164

{

165

execute_window_loop(collapsed_window, [&](const Coordinates & id)

166

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

167

auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

168

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

169

const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

170

171

#if __arm__

172

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));

#endif /* __arm__ */

int i = 0;

// This for loop performs 4 accumulations

177

for(; i <= (_k - 4); i += 4)

178

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

179

const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

180

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

181

// Convert 8-bit to 16-bit

182

typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W64>::type a0_d16[4] =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

183

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

184

wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))),

185

wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))),

186

wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))),

187

wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8)))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

188

};

189

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

190

// Accumulate to 16-bit

191

a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]);

192

a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]);

193

a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

194

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

195

// Accumulate to 32-bit

196

sum_row = wrapper::vaddw(sum_row, a0_d16[0]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

197

}

198

199

// This for loop performs the leftover accumulations

200

for(; i < _k; ++i)

201

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

202

const auto a0_d8 = wrapper::vload(matrix_a + i * 4);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

203

204

// Convert U8 to U16

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

205

const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

206

207

// Accumulate to U32

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

208

sum_row = wrapper::vaddw(sum_row, a0_d16);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

209

}

210

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

211

// Multiply by scalar if necessary

212

if(_mul_by_scalar)

213

{

214

sum_row = wrapper::vmul(sum_row, vec_scalar);

215

}

216

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

217

auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());

218

Michele Di Giorgio

13ec5f0

2020-01-02 12:11:13 +0000

[diff] [blame]

219

wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

},

in, out);

}

else // it is not reshaped

224

{

225

execute_window_loop(collapsed_window, [&](const Coordinates & id)

226

{

227

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

228

auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});

229

TAcc sum_row = 0;

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

230

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

231

const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

232

233

#if __arm__

234

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));

#endif /* __arm__ */

int i = 0;

// This for loop performs 16 accumulations

239

for(; i <= (_k - 16); i += 16)

240

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

241

const auto a0_d8 = wrapper::vloadq(matrix_a + i);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

242

243

// Partial accumulations in U16

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

244

const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

245

246

// Accumulate to U32

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

247

vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

248

}

249

250

// This for loop performs the leftover accumulations

251

for(; i < _k; ++i)

252

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

253

sum_row += static_cast<TAcc>(matrix_a[i]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

254

}

255

256

#if defined(__aarch64__)

257

// Reduction operation available on 64 bit architectures only

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

258

sum_row += wrapper::vaddv(vsum_row);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

259

#else // __aarch64__

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

260

auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));

261

tmp = wrapper::vpadd(tmp, tmp);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

262

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

263

sum_row += wrapper::vgetlane(tmp, 0);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

264

#endif // __aarch64__

265

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

266

// Multiply by scalar if necessary

if(_mul_by_scalar)

{

sum_row *= _scalar;

}

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

272

*(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

},

in, out);

}

}

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

278

void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)

279

{

280

ARM_COMPUTE_UNUSED(info);

281

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

282

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

283

284

switch(_input->info()->data_type())

285

{

286

case DataType::QASYMM8:

287

run_internal<uint8_t>(window);

288

break;

289

case DataType::QASYMM8_SIGNED:

290

case DataType::QSYMM8_PER_CHANNEL:

291

run_internal<int8_t>(window);

292

break;

293

default:

294

ARM_COMPUTE_ERROR("Unsupported data type");

}

}

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

298

void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

299

{

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

300

ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);

301

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

302

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

303

_input = mtx_b;

304

_output = vector_sum_col;

305

_k = info.k;

306

_is_reshaped = info.is_reshaped;

307

_scalar = info.scalar;

308

_mul_by_scalar = info.mul_by_scalar;

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

309

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

310

// Configure kernel window

Georgios Pinitas

2017-11-16 19:24:39 +0000

[diff] [blame]

311

auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());

312

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

313

INEKernel::configure(win_config.second);

314

}

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

315

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

316

/** Static check of whether the given infos would yield a valid matrix B reduction configuration.
 *
 * The window configuration is exercised on clones, so neither @p mtx_b nor @p vector_sum_col is modified.
 *
 * @param[in] mtx_b          Info of the input matrix.
 * @param[in] vector_sum_col Info of the output vector.
 * @param[in] info           Kernel metadata (not used by the validation).
 *
 * @return An empty Status on success, the first error encountered otherwise.
 */
Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));

    // Work on copies: the window configuration may auto-initialise the output and update paddings
    const auto mtx_b_copy          = mtx_b->clone();
    const auto vector_sum_col_copy = vector_sum_col->clone();
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b_copy.get(), vector_sum_col_copy.get()).first);

    return Status{};
}

324

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

325

template <typename T>

326

void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

327

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

328

// Intermediate and final accumulator types

329

using TIAcc = wrapper::traits::promote_t<T>;

330

using TAcc = wrapper::traits::promote_t<TIAcc>;

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

331

332

Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

333

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

334

const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});

335

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

336

if(_is_reshaped)

337

{

338

Window win_input(collapsed_window);

339

win_input.set(Window::DimX, Window::Dimension(0, 0, 0));

340

win_input.set(Window::DimY, Window::Dimension(0, 0, 0));

341

win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

342

343

Iterator in(_input, win_input);

344

Iterator out(_output, collapsed_window);

345

346

execute_window_loop(collapsed_window, [&](const Coordinates & id)

347

{

348

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

349

typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

350

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

351

wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),

352

wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),

353

wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),

354

wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

355

};

356

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

357

const auto *matrix_b = reinterpret_cast<const T *>(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

358

359

#if __arm__

360

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));

#endif /* __arm__ */

int i = 0;

for(; i < _k; ++i)

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

366

const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

367

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

368

// Convert 8bit to 16bit

369

const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

370

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

371

wrapper::vmovl(wrapper::vgetlow(b0_b8)),

372

wrapper::vmovl(wrapper::vgethigh(b0_b8))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

373

};

374

375

// Accumulate to U32

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

376

sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));

377

sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));

378

sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));

379

sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

380

}

381

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

382

// Multiply by scalar if necessary

383

if(_mul_by_scalar)

384

{

385

sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);

386

sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);

387

sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);

388

sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);

389

}

390

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

391

auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());

392

Michele Di Giorgio

13ec5f0

2020-01-02 12:11:13 +0000

[diff] [blame]

393

wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));

394

wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));

395

wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));

396

wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

},

in, out);

}

else // it is not reshaped

401

{

402

const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));

403

const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);

404

405

// The implementation computes 16 elements per iteration

406

const int window_start_x = 16 * info.thread_id;

407

const int window_step_x = 16 * info.num_threads;

408

// Make sure (window_end_x - window_start_x) is a multiple of window_step_x

409

const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

410

411

Window win_out(collapsed_window);

412

win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

413

414

Window win_in(win_out);

415

win_in.set(Window::DimY, Window::Dimension(0, 0, 0));

416

win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

417

418

Iterator inb(_input, win_in);

419

Iterator out(_output, win_out);

420

421

execute_window_loop(win_out, [&](const Coordinates & id)

422

{

423

if(id.x() > width_matrix_b)

{

return;

}

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

429

typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

430

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

431

wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),

432

wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),

433

wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),

434

wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

435

};

436

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

437

const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

438

439

#if __arm__

440

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));

441

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));

#endif /* __arm__ */

int i = 0;

// This for loop performs 4 accumulations

446

for(; i <= (_k - 4); i += 4)

447

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

448

const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);

449

const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);

450

const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);

451

const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

452

453

#if __arm__

454

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));

455

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));

456

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));

457

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));

458

#endif /* __arm__ */

459

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

460

// Partial accumulation in 16bit

461

typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

462

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

463

wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),

464

wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

465

};

466

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

467

tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));

468

tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));

469

tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));

470

tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));

471

tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));

472

tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));

473

tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));

474

tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

475

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

476

// Accumulate to 32bit

477

sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));

478

sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));

479

sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));

480

sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

481

482

matrix_b += 4 * in_b_stride;

483

}

484

485

// This for loop perfoms the leftover accumulations

486

for(; i < _k; ++i)

487

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

488

const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

489

Pablo Tello

6ff12a0

2017-11-02 16:09:35 +0000

[diff] [blame]

490

// Convert S8 to S16

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

491

const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

492

{

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

493

wrapper::vmovl(wrapper::vgetlow(b0_b8)),

494

wrapper::vmovl(wrapper::vgethigh(b0_b8))

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

495

};

496

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

497

// Accumulate to 32bit

498

sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));

499

sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));

500

sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));

501

sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

502

503

matrix_b += in_b_stride;

504

}

505

Michele Di Giorgio

2020-03-12 19:34:33 +0000

[diff] [blame]

506

// Multiply by scalar if necessary

507

if(_mul_by_scalar)

508

{

509

sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);

510

sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);

511

sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);

512

sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);

513

}

514

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

515

auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());

516

Michele Di Giorgio

13ec5f0

2020-01-02 12:11:13 +0000

[diff] [blame]

517

wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));

518

wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));

519

wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));

520

wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));

Gian Marco Iodice

2017-10-09 15:05:40 +0100

[diff] [blame]

521

},

522

inb, out);

523

}

Pablo Tello

6ff12a0

2017-11-02 16:09:35 +0000

[diff] [blame]

524

}

Georgios Pinitas

2019-10-16 19:21:40 +0100

[diff] [blame]

525

526

void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)

527

{

528

ARM_COMPUTE_UNUSED(info);

529

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

530

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

531

532

switch(_input->info()->data_type())

533

{

534

case DataType::QASYMM8:

535

run_internal<uint8_t>(window, info);

536

break;

537

case DataType::QASYMM8_SIGNED:

538

case DataType::QSYMM8_PER_CHANNEL:

539

run_internal<int8_t>(window, info);

540

break;

541

default:

542

ARM_COMPUTE_ERROR("Unsupported data type");

543

}

544

}

Michele Di Giorgio