Blame - src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp - ml/ComputeLibrary

2019-01-09 18:35:17 +0000

[diff] [blame]

1

/*

Georgios Pinitas

2020-07-02 20:02:20 +0100

[diff] [blame]

2

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#pragma once

#include <assert.h>

#include <algorithm>

#include "arm_gemm.hpp"

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

31

#include "bias_adder.hpp"

Michele Di Giorgio

6ad60af

2020-06-09 14:52:15 +0100

[diff] [blame]

32

#include "ndrange.hpp"

Georgios Pinitas

2020-07-02 20:02:20 +0100

[diff] [blame]

33

#include "utils.hpp"

Vincent ABRIOU

04c8e63

2020-05-27 16:26:46 +0200

[diff] [blame]

34

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

35

#include "mergeresults.hpp"

36

#include "transform.hpp"

37

38

#ifdef CYCLE_PROFILING

39

#include "profiler.hpp"

#endif

namespace arm_gemm {

// Implementation of the GemmCommon abstract class.

45

template<typename strategy, typename To, typename Tr>

46

class GemmHybrid : public GemmCommon<To, Tr> {

47

typedef typename strategy::operand_type Toi;

48

typedef typename strategy::result_type Tri;

49

50

/* const properties set by constructor */

51

const CPUInfo * const _ci;

52

53

const unsigned int _Msize;

54

const unsigned int _Nsize;

55

const unsigned int _Ksize;

56

57

const unsigned int _nbatches;

58

const unsigned int _nmulti;

59

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

60

const Activation _act;

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

61

62

/* Blocking info */

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

63

const unsigned int _k_block;

64

const unsigned int _n_block;

65

const unsigned int _Mround;

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

66

67

/* Pretransposed buffer. */

68

const Toi *_B_transposed=nullptr;

69

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

70

const NDRange<4> _window_range;

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

71

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

72

static unsigned int compute_k_block(const GemmArgs &args) {

Georgios Pinitas

0cc50ed

2020-07-06 19:10:38 +0100

[diff] [blame]

73

// Some kernels don't support accumulate mode - these can't do K blocking at all.

74

if (!strategy::supports_accumulate()) {

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

return args._Ksize;

}

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

78

if (args._cfg && args._cfg->inner_block_size) {

79

return args._cfg->inner_block_size;

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

80

}

81

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

82

const unsigned int L1_size = args._ci->get_L1_cache_size();

83

84

// k_block: Find out how much of the larger array can be loaded into half the cache.

85

// This should account for associative caches.

86

unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

87

88

// Needs to be (at least a single) multiple of the K unroll level.

89

k_block /= strategy::k_unroll();

90

k_block = std::max(k_block, 1U) * strategy::k_unroll();

91

92

// Now tune to presented problem size; this is how many blocks we need.

93

unsigned int numk_blocks = iceildiv(args._Ksize, k_block);

94

95

// So divide the space equally into that many blocks.

96

k_block = iceildiv(args._Ksize, numk_blocks);

97

98

// And round UP to the K unroll level required.

99

k_block = roundup(k_block, strategy::k_unroll());

return k_block;

}

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

104

static unsigned int compute_n_block(const GemmArgs &args) {

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

105

if (args._cfg && args._cfg->outer_block_size) {

106

return args._cfg->outer_block_size;

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

107

}

108

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

109

const unsigned int k_block = compute_k_block(args);

110

const unsigned int L2_size = args._ci->get_L2_cache_size();

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

111

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

112

// n_block: Work out how many rows (of length k_block) will fit in the L2

113

// Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.

114

unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

115

(sizeof(Toi) * k_block);

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

116

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

117

// Needs to be (at least a single) multiple of the kernel output width.

118

n_block /= strategy::out_width();

119

n_block = std::max(n_block, 1U) * strategy::out_width();

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

120

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

121

// And tune to the presented problem size.

122

unsigned int numblocks = iceildiv(args._Nsize, n_block);

123

n_block = iceildiv(args._Nsize, numblocks);

124

n_block = roundup(n_block, strategy::out_width());

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

125

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

126

return n_block;

127

}

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

128

129

public:

130

GemmHybrid(GemmHybrid &) = delete;

131

GemmHybrid & operator= (GemmHybrid &) = delete;

132

133

/* Constructor */

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

134

GemmHybrid(const GemmArgs &args)

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

135

: _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),

Georgios Pinitas

0cc50ed

2020-07-06 19:10:38 +0100

[diff] [blame]

136

_nbatches(args._nbatches), _nmulti(args._nmulti),

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

137

_act(args._act),

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

138

_k_block(compute_k_block(args)), _n_block(compute_n_block(args)),

139

_Mround(roundup(args._Msize, strategy::out_height())),

140

_window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { }

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

141

142

// Interface implementation - Compulsory functions

Joseph Dobson

6f8b17d

2020-02-11 19:32:11 +0000

[diff] [blame]

143

/* The work window is exposed as a single flat dimension whose extent is the
 * total number of points in the internal 4-D window range. */
ndrange_t get_window_size() const override {
    const auto total_work_items = _window_range.total_size();

    return { total_work_items };
}

146

147

// This kernel can always be dynamically scheduled.

148

bool supports_dynamic_scheduling() const override {

149

return true;

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

150

}

151

Georgios Pinitas

2020-07-02 20:02:20 +0100

[diff] [blame]

152

// Execute

153

void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

154

#ifdef CYCLE_PROFILING

profiler prof;

#endif

strategy strat(_ci);

/* Make sure we've been set up correctly. */

160

assert(_B_transposed);

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

161

static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");

162

static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");

163

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

164

/* For now, each work item implies all the K for a given output

165

* pixel (so we don't need to synchronize access to the output

166

* array). So separate the loop over K blocks here. */

167

for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {

168

unsigned int kmax = std::min(k0 + _k_block, _Ksize);

169

unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

170

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

171

const bool first_pass = (k0 == 0);

172

const bool last_pass = (kmax == _Ksize);

173

Georgios Pinitas

2020-07-02 20:02:20 +0100

[diff] [blame]

174

auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

175

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

176

if (p.done()) {

177

return;

178

}

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

179

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

180

do {

181

const unsigned int m_start = p.dim(0) * strategy::out_height();

182

const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize);

183

const unsigned int batch = p.dim(1);

184

const unsigned int n0 = p.dim(2) * _n_block;

185

const unsigned int nmax = std::min(n0 + _n_block, _Nsize);

186

const unsigned int multi = p.dim(3);

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

187

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

188

const Toi *b_panel = _B_transposed +

189

(multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +

190

(k0 * roundup(_Nsize, strategy::out_width())) +

191

(n0 * kern_k);

192

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

193

#ifdef CYCLE_PROFILING

Georgios Pinitas

2020-07-02 20:02:20 +0100

[diff] [blame]

194

auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

195

#endif

196

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

197

strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,

198

b_panel,

199

this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

200

(m_end - m_start), (nmax - n0), kmax-k0,

201

(strategy::supports_bias() && first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,

202

last_pass ? _act : Activation(), !first_pass);

203

204

// Add bias externally if needed

205

if (!strategy::supports_bias() && this->_bias && first_pass) {

206

bias_adder(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,

207

this->_bias + (multi * this->_bias_multi_stride) + n0,

208

(m_end - m_start), (nmax - n0));

209

}

210

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

211

} while (p.next_dim1());

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

}

}

// Interface implementation - pretransposed

216

/* This implementation always reports B as pretransposed. */
bool B_is_pretransposed() const override {
    return true;
}

bool B_pretranspose_required() const override {

221

return (_B_transposed==nullptr);

222

}

223

224

size_t get_B_pretransposed_array_size() const override {

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

225

return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

226

}

227

228

void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {

229

Toi *buffer = reinterpret_cast<Toi *>(in_buffer);

230

_B_transposed = buffer;

231

strategy strat(_ci);

232

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

233

for (unsigned int multi=0; multi<_nmulti; multi++) {

234

for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {

235

const unsigned int kmax = std::min(k0 + _k_block, _Ksize);

236

const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

237

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

238

for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {

239

const unsigned int xmax = std::min(x0+_n_block, _Nsize);

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

240

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

241

const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

242

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

243

strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,

Georgios Pinitas

0cc50ed

2020-07-06 19:10:38 +0100

[diff] [blame]

244

x0, xmax, k0, kmax);

Georgios Pinitas

2019-01-09 18:35:17 +0000

[diff] [blame]

245

Georgios Pinitas

2019-01-23 11:24:50 +0000

[diff] [blame]

246

buffer += size;

247

}

248

}

Georgios Pinitas