/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <stdio.h>
#include <assert.h>

#include <algorithm>

#include "arm_gemm.hpp"
#include "utils.hpp"

#include "buffer_manager.hpp"
#include "mergeresults.hpp"
#include "transform.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
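// e.g. ROUND_UP(100) == 128 and ROUND_UP(64) == 64.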

// Implementation of the GemmCommon abstract class.
//
// This implementation interleaves the source matrices in blocks - good for
// larger matrices.
namespace arm_gemm {

template<typename strategy, typename To, typename Tr>
class GemmInterleaved : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const bool _trA;
    const bool _trB;

    const Activation _act;

    const int _maxthreads;
    int _nthreads;
    const bool _pretransposed;

    /* Blocking info */
    unsigned int _k_block=0;
    unsigned int _x_block=0;
    unsigned int _Mround=0;

    /* Working space, pretransposed buffer, buffer manager */
    const Toi *_B_transposed=nullptr;
    BufferManager *_bm=nullptr;
    void *_working_space=nullptr;

    /* We will need to walk through the blocks of B in a few contexts, so
     * factor that out. */
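    /* The walk advances through X (output column) blocks fastest, then K
     * blocks, then multis; newkblock() reports when a K block boundary has
     * just been crossed, so callers know the A panel needs re-preparing. */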
    class blockwalker {
    private:
        /* Size loops, etc. based on our parent's configuration */
        const GemmInterleaved<strategy, To, Tr> &_parent;

        /* K, X and multi parameters for current iteration. */
        unsigned int _k0=0, _x0=0, _multi=0;

        unsigned int _index=0;
        bool _done=false;
        bool _newkblock=true;
        bool _newmulti=true;

    public:
        blockwalker(const GemmInterleaved<strategy, To, Tr> &parent) : _parent(parent) { }

        unsigned int xmax() {
            return std::min(_x0 + _parent._x_block, _parent._Nsize);
        }

        unsigned int kmax() {
            return std::min(_k0 + _parent._k_block, _parent._Ksize);
        }

        /* Advance to the next block, return false at the end. */
        bool advance(void) {
            if (_done) {
                return false;
            }

            _newkblock=false;
            _x0 += _parent._x_block;
            if (_x0 >= _parent._Nsize) {
                _x0=0;
                _k0 += _parent._k_block;
                if (_k0 >= _parent._Ksize) {
                    _k0=0;
                    _multi++;
                    if (_multi >= _parent._nmulti) {
                        _done=true;
                        return false;
                    }
                    _newmulti=true;
                }
                _newkblock=true;
            }
            _index++;

            return true;
        }

        unsigned int k0(void) { return _k0; }
        unsigned int x0(void) { return _x0; }
        unsigned int multi(void) { return _multi; }
        unsigned int index(void) { return _index; }
        bool done(void) { return _done; }
        bool newkblock(void) { return _newkblock; }
    };

    // A working size: One of these needed, regardless of thread count. Divided according to window.
    size_t get_a_working_size() const {
        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
    }

    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
    size_t get_b_working_size() const {
        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
    }

    // C working size: One needed per thread.
    size_t get_c_working_size() const {
        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
    }

    // Internal execute function.
    // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
    template<bool pretransposed>
    void execute_internal(unsigned int start, unsigned int end, int threadid) {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_ci);

        blockwalker current(*this);
        blockwalker next=current;

        /* Translate 'start' and 'end' into a position within the batches and rows. */
        const unsigned int window_per_batch = _Mround / strategy::out_height();
        unsigned int batch_0   = start / window_per_batch;
        unsigned int batch_end = end / window_per_batch;

        /* Compute the M values to operate on */
        unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height();
        unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
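        // For illustration (assumed numbers): with out_height()==8, _Mround==24
        // (so window_per_batch==3) and _nbatches==2, a thread given start=1,
        // end=5 gets batch_0=0, batch_end=1, m_0=8 and m_max=16 - i.e. rows
        // 8.._Msize of batch 0 and rows 0..16 of batch 1.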

        /* Make sure we've been set up correctly. */
        if (pretransposed) {
            assert(_B_transposed);
        } else {
            assert(_bm);
        }

        assert(_working_space);
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

        // Private buffers. Treat working_space as an array of C buffers
        // (one per thread) first, followed by the (window-divided) A
        // buffer.
        // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));

        // Shared buffers - these come either from BufferManager or _B_transposed.
        const Toi *b_panel;

        if (pretransposed) {
            b_panel = _B_transposed;
        }

        //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);

        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
        int kern_k = 0;

        for (;!current.done();current.advance()) {
            if (current.newkblock()) {
#ifdef CYCLE_PROFILING
                auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                    if (first_m >= last_m)
                        continue;

                    strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block),
                                              this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                              this->_lda, first_m, last_m, current.k0(), current.kmax(), _trA);
                }

                // Figure out how many "K" the kernel will actually process.
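                // e.g. (assumed numbers) kmax()-k0() == 250 with k_unroll() == 4
                // gives kern_k == iceildiv(250, 4) * 4 == 252.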
                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
                kern_k *= strat.k_unroll();
            }

            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());

            if (!pretransposed) {
                /* Look ahead to the next block and populate it if necessary.
                 * This avoids the populate operation becoming a bottleneck, and
                 * helps keep the threads synchronized (the first thread to get
                 * here will populate while the rest will advance).
                 *
                 * If we are running single threaded, bm->try_populate() will do
                 * nothing.
                 */
                if (next.advance()) {
                    _bm->try_populate(next.index(), [&](void *buffer) {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_PREPB, (next.xmax()-next.x0()) * (next.kmax()-next.k0()) * sizeof(Toi));
#endif

                        Toi *b_panel = reinterpret_cast<Toi *>(buffer);

                        strat.transforms.PrepareB(b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
                                                  next.x0(), next.xmax(), next.k0(), next.kmax(), _trB);
                    });
                }

                /* Get the buffer for this iteration from the BufferManager. */
                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv) {
#ifdef CYCLE_PROFILING
                    auto p=prof.ScopedProfiler(PROFILE_PREPB, (current.xmax()-current.x0()) * (current.kmax()-current.k0()) * sizeof(Toi));
#endif

                    Toi *b_panel = reinterpret_cast<Toi *>(bpv);

                    strat.transforms.PrepareB(b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
                                              current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
                }));
            }

            /* Do the actual work. */
            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;

                if (first_m >= last_m)
                    continue;

                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());

                    {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

                        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);

                        a_ptr += (strategy::out_height() * kern_k);
                    }

                    {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
                        /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
                        const bool first_pass = current.k0()==0;
                        const bool last_pass  = current.kmax()==_Ksize;

                        strat.transforms.Merge(this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
                                               c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
                                               ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                                               (last_pass ? _act : Activation()), !first_pass);
                    }
                }
            }

            if (pretransposed) {
                b_panel += (bblocks * strat.out_width() * kern_k);
            } else {
                _bm->release(current.index());
            }
        }
    }

public:
    GemmInterleaved(GemmInterleaved &) = delete;
    GemmInterleaved & operator= (GemmInterleaved &) = delete;

    /* Constructor */
    GemmInterleaved(const GemmArgs &args)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
          _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
          _pretransposed(args._pretransposed_hint) {
        const unsigned int L1_size = _ci->get_L1_cache_size();
        const unsigned int L2_size = _ci->get_L2_cache_size();

        assert(_maxthreads > 0);

        // Work out blocking parameters, or override from provided GemmConfig
        if (args._cfg && args._cfg->inner_block_size) {
            _k_block = args._cfg->inner_block_size;
        } else {
            // k_block: Find out how much of the larger array can be loaded into half the cache.
            // This should account for associative caches.
            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

            // Needs to be (at least a single) multiple of the K unroll level.
            _k_block /= strategy::k_unroll();
            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();

            // Now tune to presented problem size; this is how many blocks we need.
            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);

            // So divide the space equally into that many blocks.
            _k_block = iceildiv(_Ksize, num_k_blocks);

            // And round UP to the K unroll level required.
            _k_block = iceildiv(_k_block, strategy::k_unroll());
            _k_block *= strategy::k_unroll();
        }
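        // For illustration (assumed numbers): with a 32KB L1, 4-byte operands,
        // a 12-wide/8-high kernel with k_unroll()==1 and K==1000, _k_block
        // starts at 16384/48 == 341 and is then retuned to
        // iceildiv(1000, iceildiv(1000, 341)) == iceildiv(1000, 3) == 334.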

        if (args._cfg && args._cfg->outer_block_size) {
            _x_block = args._cfg->outer_block_size;
        } else {
            // x_block: Work out how many rows (of length k_block) will fit in the L2
            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
                       (sizeof(Toi) * _k_block);

            // Needs to be (at least a single) multiple of the kernel output width.
            _x_block /= strategy::out_width();
            _x_block = std::max(_x_block, 1U) * strategy::out_width();

            // And tune to the presented problem size.
            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
            _x_block = iceildiv(_Nsize, num_x_blocks);

            _x_block = iceildiv(_x_block, strategy::out_width());
            _x_block *= strategy::out_width();
        }

        // Work out the rounded size of M - needed for some buffers.
        _Mround = iceildiv(_Msize, strategy::out_height());
        _Mround *= strategy::out_height();
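        // e.g. (assumed numbers) _Msize==20 with out_height()==8 gives _Mround==24.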
    }

    // Interface implementation - Compulsory functions

    // Window size: Only the last thread should do a ragged block, so dole
    // out work in units of out_height. Factor batches into the window, but
    // not multi for now (as this would cause problems with the buffer
    // manager).
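    // e.g. (assumed numbers) _Mround==24, out_height()==8 and _nbatches==2
    // give a window size of (24/8)*2 == 6 units.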
    unsigned int get_window_size() const override {
        // _Mround is a multiple of out_height by definition.
        return (_Mround / strategy::out_height()) * _nbatches;
    }

    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
    void set_nthreads(int nthreads) override {
        _nthreads = std::min(nthreads, _maxthreads);
        if (_bm) {
            _bm->set_nthreads(_nthreads);
        }
    }

    // Execute
    void execute(unsigned int start, unsigned int end, int threadid) override {
        if (_pretransposed) {
            execute_internal<true>(start, end, threadid);
        } else {
            execute_internal<false>(start, end, threadid);
        }
    }

    // Interface implementation - working space
    size_t get_working_size() const override {
        // In all cases, we need one A buffer plus a C buffer per thread.
        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);

        // For pretransposed case, there is no working space needed for B.
        // Otherwise, we need a BufferManager.
        if (!_pretransposed) {
            size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
        }

        size += 64; // Add on a cache line extra for alignment.

        return size;
    }

    void set_working_space(void *working_space) override {
        // Make sure everything ends up cache line aligned
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int  = reinterpret_cast<intptr_t>(working_space);

        size_t diff=0;

        if (working_space_int & 0x3F) {
            diff = 0x40 - (working_space_int & 0x3F);
        }
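        // e.g. a pointer ending in 0x08 gives diff == 0x38, pushing the buffer
        // up to the next 64-byte boundary.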

        working_space_bytes += diff;

        if (_pretransposed) {
            // Pretransposed case: just set internal pointer to parameter value.
            _working_space = reinterpret_cast<void *>(working_space_bytes);
        } else {
            // Otherwise, use the first part of the working space for the buffer manager.
            // It's legal to call this again so don't leak a buffer manager if it already existed.
            delete _bm;

            _bm = new BufferManager(_nthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));

            working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());

            _working_space = reinterpret_cast<void *>(working_space_bytes);
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return _pretransposed;
    }

    bool B_pretranspose_required() const override {
        return _pretransposed && (_B_transposed==nullptr);
    }

    // TODO: this could almost certainly be considerably simpler.
    size_t get_B_pretransposed_array_size() const override {
        size_t total=0;
        blockwalker current(*this);

        do {
            /* Figure out the size of each block. */
            unsigned int x_size = (current.xmax() - current.x0());
            unsigned int k_size = (current.kmax() - current.k0());

            /* Round sizes up as needed. */
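            /* e.g. (assumed numbers) an X block of 100 columns with
             * out_width()==12 is padded to 108, and a K block of 250 with
             * k_unroll()==4 is padded to 252. */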
            x_size = iceildiv(x_size, strategy::out_width());
            x_size *= strategy::out_width();

            k_size = iceildiv(k_size, strategy::k_unroll());
            k_size *= strategy::k_unroll();

            total += x_size * k_size * sizeof(Toi);
        } while (current.advance());

        return total;
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        blockwalker current(*this);
        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
        _B_transposed = buffer;
        strategy strat(_ci);

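        // Fill the buffer in the same blockwalker order that execute_internal()
        // consumes it, so the pretransposed B data can be read sequentially.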
        do {
            /* Figure out the size of each block. */
            unsigned int x_size = (current.xmax() - current.x0());
            unsigned int k_size = (current.kmax() - current.k0());

            /* Round sizes up as needed. */
            x_size = iceildiv(x_size, strategy::out_width());
            x_size *= strategy::out_width();

            k_size = iceildiv(k_size, strategy::k_unroll());
            k_size *= strategy::k_unroll();

            strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                      current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);

            buffer += (x_size * k_size);
        } while (current.advance());
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
    }

    ~GemmInterleaved() override {
        delete _bm;
    }
};

} // namespace arm_gemm