Blame - src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp - ml/ComputeLibrary

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

/*
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

24

#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"

25

26

#include "arm_compute/core/AccessWindowStatic.h"

27

#include "arm_compute/core/Error.h"

28

#include "arm_compute/core/Helpers.h"

29

#include "arm_compute/core/ITensor.h"

30

#include "arm_compute/core/TensorInfo.h"

31

#include "arm_compute/core/Types.h"

32

#include "arm_compute/core/Utils.h"

33

#include "arm_compute/core/Validate.h"

34

#include "arm_compute/core/Window.h"

35

36

#include <arm_neon.h>

#include <cstddef>

#include <cstdint>

using namespace arm_compute;

41

42

namespace arm_compute
{
// Forward declaration: Coordinates is only used by reference in this file
// (as the argument type of the execute_window_loop lambdas).
class Coordinates;
} // namespace arm_compute

46

47

// Default constructor: value-initializes the input/output tensor members,
// sets the reduction length (_k) to 0 and marks the input as not reshaped.
// The members are assigned their real values in the derived configure() calls.
INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
    : _input(), _output(), _k(0), _is_reshaped(false)
{
}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

52

void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

53

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

54

ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

55

ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);

56

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

57

_input = mtx_a;

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

58

_output = vector_sum_row;

59

_k = num_mtx_a_cols;

60

_is_reshaped = is_interleaved4x4;

61

62

const unsigned int num_elems_processed_per_iteration = _is_reshaped ? 4 : 1;

63

64

// Configure kernel window

65

Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));

66

67

AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));

68

AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);

69

70

update_window_and_padding(win,

input_access,

output_access);

output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));

75

76

INEKernel::configure(win);

77

}

78

79

void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)

80

{

81

ARM_COMPUTE_UNUSED(info);

82

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

83

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

84

85

Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

86

87

Window win_input(collapsed_window);

88

win_input.set(Window::DimX, Window::Dimension(0, 0, 0));

89

win_input.set(Window::DimY, Window::Dimension(0, 0, 0));

90

win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

91

92

Iterator in(_input, win_input);

93

Iterator out(_output, collapsed_window);

if(_is_reshaped)

{

execute_window_loop(collapsed_window, [&](const Coordinates & id)

98

{

99

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

100

uint32x4_t sum_row = vdupq_n_u32(0);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

101

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

102

const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

103

104

#if __arm__

105

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));

#endif /* __arm__ */

int i = 0;

// This for loop performs 4 accumulations

110

for(; i <= (_k - 4); i += 4)

111

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

112

const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

113

114

// Convert U8 to U16

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

115

uint16x4x4_t a0_u16 =

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

116

{

117

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

118

vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))),

119

vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))),

120

vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))),

121

vget_high_u16(vmovl_u8(vget_high_u8(a0_u8)))

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

// Accumulate to U16

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

126

a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]);

127

a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]);

128

a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

129

130

// Accumulate to U32

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

131

sum_row = vaddw_u16(sum_row, a0_u16.val[0]);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

132

}

133

134

// This for loop performs the leftover accumulations

135

for(; i < _k; ++i)

136

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

137

const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

138

139

// Convert U8 to U16

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

140

const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8));

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

141

142

// Accumulate to U32

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

143

sum_row = vaddw_u16(sum_row, a0_u16);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

144

}

145

146

auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());

147

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

148

vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row));

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

},

in, out);

}

else // it is not reshaped

153

{

154

execute_window_loop(collapsed_window, [&](const Coordinates & id)

155

{

156

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

157

uint32x4_t sum_row_u32 = vdupq_n_u32(0);

158

uint32_t sum_row = 0;

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

159

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

160

const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

161

162

#if __arm__

163

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));

#endif /* __arm__ */

int i = 0;

// This for loop performs 16 accumulations

168

for(; i <= (_k - 16); i += 16)

169

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

170

const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

171

172

// Partial accumulations in U16

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

173

const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8));

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

174

175

// Accumulate to U32

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

176

sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0));

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

177

}

178

179

// This for loop performs the leftover accumulations

180

for(; i < _k; ++i)

181

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

182

sum_row += static_cast<uint32_t>(matrix_a[i]);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

183

}

184

185

#if defined(__aarch64__)

186

// Reduction operation available on 64 bit architectures only

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

187

sum_row += vaddvq_u32(sum_row_u32);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

188

#else // __aarch64__

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

189

uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32));

190

tmp = vpadd_u32(tmp, tmp);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

191

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

192

sum_row += vget_lane_u32(tmp, 0);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

193

#endif // __aarch64__

194

195

*(reinterpret_cast<int *>(out.ptr())) = static_cast<int>(sum_row);

},

in, out);

}

}

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

201

void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

202

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

203

ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

204

ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);

205

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

206

_input = mtx_b;

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

207

_output = vector_sum_col;

208

_k = num_mtx_b_rows;

209

_is_reshaped = is_transposed1xW;

210

211

constexpr unsigned int num_elems_processed_per_iteration = 16;

212

213

// Configure kernel window

214

Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration));

215

216

AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));

217

AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);

218

219

update_window_and_padding(win,

input_access,

output_access);

output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));

224

225

INEKernel::configure(win);

226

}

227

228

void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)

229

{

230

ARM_COMPUTE_UNUSED(info);

231

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

232

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

233

234

Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

if(_is_reshaped)

{

Window win_input(collapsed_window);

239

win_input.set(Window::DimX, Window::Dimension(0, 0, 0));

240

win_input.set(Window::DimY, Window::Dimension(0, 0, 0));

241

win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

242

243

Iterator in(_input, win_input);

244

Iterator out(_output, collapsed_window);

245

246

execute_window_loop(collapsed_window, [&](const Coordinates & id)

247

{

248

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

249

uint32x4x4_t sum_col =

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

250

{

251

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

vdupq_n_u32(0),

vdupq_n_u32(0),

vdupq_n_u32(0),

vdupq_n_u32(0)

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

259

const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2];

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

260

261

#if __arm__

262

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));

#endif /* __arm__ */

int i = 0;

for(; i < _k; ++i)

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

268

const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

269

Pablo Tello

6ff12a0

2017-11-02 16:09:35 +0000

[diff] [blame]

270

// Convert S8 to U16

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

271

const uint16x8x2_t b0_u16 =

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

272

{

273

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

274

vmovl_u8(vget_low_u8(b0_u8)),

275

vmovl_u8(vget_high_u8(b0_u8))

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

// Accumulate to U32

sum_col =

{

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

283

vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),

284

vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),

285

vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),

286

vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

}

auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());

292

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

293

vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));

294

vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));

295

vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));

296

vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

},

in, out);

}

else // it is not reshaped

301

{

302

const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));

303

const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);

304

305

// The implementation computes 16 elements per iteration

306

const int window_start_x = 16 * info.thread_id;

307

const int window_step_x = 16 * info.num_threads;

308

// Make sure (window_end_x - window_start_x) is a multiple of window_step_x

309

const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

310

311

Window win_out(collapsed_window);

312

win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

313

314

Window win_in(win_out);

315

win_in.set(Window::DimY, Window::Dimension(0, 0, 0));

316

win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

317

318

Iterator inb(_input, win_in);

319

Iterator out(_output, win_out);

320

321

execute_window_loop(win_out, [&](const Coordinates & id)

322

{

323

if(id.x() > width_matrix_b)

{

return;

}

// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

329

uint32x4x4_t sum_col =

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

330

{

331

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

vdupq_n_u32(0),

vdupq_n_u32(0),

vdupq_n_u32(0),

vdupq_n_u32(0)

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

339

const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2];

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

340

341

#if __arm__

342

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));

343

asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));

#endif /* __arm__ */

int i = 0;

// This for loop performs 4 accumulations

348

for(; i <= (_k - 4); i += 4)

349

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

350

const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);

351

const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride);

352

const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride);

353

const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

354

355

#if __arm__

356

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));

357

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));

358

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));

359

asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));

360

#endif /* __arm__ */

361

362

// Partial accumulation in u16

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

363

uint16x8x2_t tmp_sum =

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

364

{

365

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

366

vdupq_n_u16(0),

367

vdupq_n_u16(0)

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

Gian Marco

2017-11-08 12:24:09 +0000

[diff] [blame]

371

tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8));

372

tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8));

373

tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8));

374

tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8));

375

tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8));

376

tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8));

377

tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8));

378

tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8));

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

// Accumulate to U32

sum_col =

{

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

384

vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])),

385

vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])),

386

vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])),

387

vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1]))

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

matrix_b += 4 * in_b_stride;

392

}

393

394

// This for loop perfoms the leftover accumulations

395

for(; i < _k; ++i)

396

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

397

const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

398

Pablo Tello

6ff12a0

2017-11-02 16:09:35 +0000

[diff] [blame]

399

// Convert S8 to S16

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

400

const uint16x8x2_t b0_u16 =

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

401

{

402

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

403

vmovl_u8(vget_low_u8(b0_u8)),

404

vmovl_u8(vget_high_u8(b0_u8))

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

// Accumulate to U32

sum_col =

{

{

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

412

vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),

413

vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),

414

vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),

415

vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

}

};

matrix_b += in_b_stride;

420

}

421

422

auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());

423

Gian Marco

e75a02b

2017-11-08 12:24:09 +0000

[diff] [blame]

424

vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));

425

vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));

426

vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));

427

vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));

Gian Marco Iodice

ab18212

2017-10-09 15:05:40 +0100

[diff] [blame]

428

},

429

inb, out);

430

}

Pablo Tello

6ff12a0

2017-11-02 16:09:35 +0000

[diff] [blame]

431

}