Blame - src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp - ml/ComputeLibrary

2019-06-27 17:00:52 +0100

[diff] [blame]

1

/*

Georgios Pinitas

2021-07-16 16:16:43 +0100

[diff] [blame]

2

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#pragma once

#include <assert.h>

#include <algorithm>

#include "arm_gemm.hpp"

Michele Di Giorgio

6ad60af

2020-06-09 14:52:15 +0100

[diff] [blame]

31

#include "ndrange.hpp"

Georgios Pinitas

5aa1a0b

2020-07-02 20:02:20 +0100

[diff] [blame]

32

#include "utils.hpp"

Vincent ABRIOU

04c8e63

2020-05-27 16:26:46 +0200

[diff] [blame]

33

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

34

#include "mergeresults.hpp"

35

#include "transform.hpp"

36

37

#ifdef CYCLE_PROFILING

38

#include "profiler.hpp"

#endif

namespace arm_gemm {

// Implementation of the GemmCommon abstract class.

44

template<typename strategy, typename To, typename Tr>

45

class GemmHybridQuantized : public GemmCommon<To, Tr> {

46

typedef typename strategy::operand_type Toi;

47

typedef typename strategy::result_type Tri;

48

49

/* const properties set by constructor */

50

const CPUInfo * const _ci;

51

52

const unsigned int _Msize;

53

const unsigned int _Nsize;

54

const unsigned int _Ksize;

55

56

const unsigned int _nbatches;

57

const unsigned int _nmulti;

58

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

59

/* Blocking info */

60

const unsigned int _k_block;

61

const unsigned int _n_block;

62

const unsigned int _Mround;

63

64

/* Pretransposed buffer. */

65

const Toi *_B_transposed=nullptr;

66

67

const NDRange<4> _window_range;

68

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

69

Requantize32 _qp;

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

70

int32_t *row_bias = nullptr;

71

int32_t *col_bias = nullptr;

72

73

void *working_space = nullptr;

74

75

unsigned int _nthreads;

76

77

unsigned int get_col_sum_size() const {

78

return _Nsize * _nmulti * sizeof(int32_t);

79

}

80

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

81

static unsigned int compute_k_block(const GemmArgs &args) {

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

82

// We don't support K blocks as we only temporarily store 32 bit results.

83

return args._Ksize;

Georgios Pinitas

2021-07-16 16:16:43 +0100

[diff] [blame]

84

85

if (args._cfg && args._cfg->inner_block_size) {

86

return args._cfg->inner_block_size;

87

}

88

89

const unsigned int L1_size = args._ci->get_L1_cache_size();

90

91

// k_block: Find out how much of the larger array can be loaded into half the cache.

92

// This should account for associative caches.

93

unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

94

95

// Needs to be (at least a single) multiple of the K unroll level.

96

k_block /= strategy::k_unroll();

97

k_block = std::max(k_block, 1U) * strategy::k_unroll();

98

99

// Now tune to presented problem size; this is how many blocks we need.

100

unsigned int numk_blocks = iceildiv(args._Ksize, k_block);

101

102

// So divide the space equally into that many blocks.

103

k_block = iceildiv(args._Ksize, numk_blocks);

104

105

// And round UP to the K unroll level required.

106

k_block = roundup(k_block, strategy::k_unroll());

107

108

return k_block;

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

109

}

110

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

111

static unsigned int compute_n_block(const GemmArgs &args) {

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

112

if (args._cfg && args._cfg->outer_block_size) {

Georgios Pinitas

2021-07-16 16:16:43 +0100

[diff] [blame]

113

unsigned int n_block = args._cfg->outer_block_size;

114

115

// Needs to be (at least a single) multiple of the kernel output width.

116

n_block /= strategy::out_width();

117

n_block = std::max(n_block, 1u) * strategy::out_width();

118

119

return n_block;

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

120

}

121

122

const unsigned int k_block = compute_k_block(args);

123

const unsigned int L2_size = args._ci->get_L2_cache_size();

124

125

// n_block: Work out how many rows (of length k_block) will fit in the L2

126

// Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.

Georgios Pinitas

c0b6f76

2020-11-02 01:37:17 +0000

[diff] [blame]

127

const unsigned int scaled_l2_size = (L2_size * 9) / 10;

128

const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());

129

130

// .. if the L1 contents is bigger than the L2, just return a minimal size block.

131

if (k_block_area > scaled_l2_size) {

132

return strategy::out_width();

133

}

134

135

unsigned int n_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

136

137

// Needs to be (at least a single) multiple of the kernel output width.

138

n_block /= strategy::out_width();

Georgios Pinitas

c0b6f76

2020-11-02 01:37:17 +0000

[diff] [blame]

139

n_block = std::max(n_block, 1u) * strategy::out_width();

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

140

141

// And tune to the presented problem size.

142

unsigned int numblocks = iceildiv(args._Nsize, n_block);

143

n_block = iceildiv(args._Nsize, numblocks);

144

n_block = roundup(n_block, strategy::out_width());

145

Georgios Pinitas

c0b6f76

2020-11-02 01:37:17 +0000

[diff] [blame]

146

assert(n_block > 0);

147

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

return n_block;

}

public:

GemmHybridQuantized(GemmHybridQuantized &) = delete;

153

GemmHybridQuantized & operator= (GemmHybridQuantized &) = delete;

154

155

/* Constructor */

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

156

GemmHybridQuantized(const GemmArgs &args, const Requantize32 &qp)

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

157

: _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),

Georgios Pinitas

0cc50ed

2020-07-06 19:10:38 +0100

[diff] [blame]

158

_nbatches(args._nbatches), _nmulti(args._nmulti),

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

159

_k_block(compute_k_block(args)), _n_block(compute_n_block(args)),

160

_Mround(roundup(args._Msize, strategy::out_height())),

161

_window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti),

162

_qp (qp), _nthreads(args._maxthreads) { }

163

164

// Interface implementation - Compulsory functions

Joseph Dobson

6f8b17d

2020-02-11 19:32:11 +0000

[diff] [blame]

165

/* Returns the window size: a single dimension whose extent is the total
 * size of the four-dimensional _window_range. */
ndrange_t get_window_size() const override {
    const auto total_items = _window_range.total_size();

    return { total_items };
}

168

169

// This kernel can always be dynamically scheduled.

170

/* Reports that dynamic scheduling is supported; the answer is
 * unconditionally true. */
bool supports_dynamic_scheduling() const override {
    return true;
}

Georgios Pinitas

2020-07-02 20:02:20 +0100

[diff] [blame]

174

// Execute

175

void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

176

#ifdef CYCLE_PROFILING

profiler prof;

#endif

strategy strat(_ci);

uintptr_t working_int = reinterpret_cast<uintptr_t>(working_space);

182

183

Tri *result_buffer = reinterpret_cast<Tri *>(working_int + (threadid * strategy::out_height() * _Nsize * sizeof(Tri)));

184

185

/* Make sure we've been set up correctly. */

186

assert(_B_transposed);

187

static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");

188

189

/* For now, each work item implies all the K for a given output

190

* pixel (so we don't need to synchronize access to the output

191

* array). So separate the loop over K blocks here. */

192

for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {

193

unsigned int kmax = std::min(k0 + _k_block, _Ksize);

194

unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

195

Georgios Pinitas

5aa1a0b

2020-07-02 20:02:20 +0100

[diff] [blame]

196

auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

if (p.done()) {

return;

}

do {

const unsigned int m_start = p.dim(0) * strategy::out_height();

204

const unsigned int m_end = std::min((p.dim(0) + 1) * strategy::out_height(), _Msize);

205

const unsigned int batch = p.dim(1);

206

const unsigned int n0 = p.dim(2) * _n_block;

207

const unsigned int nmax = std::min(n0 + _n_block, _Nsize);

208

const unsigned int multi = p.dim(3);

209

210

int32_t local_row_sums[strategy::out_height()];

211

212

const Toi *b_panel = _B_transposed +

213

(multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +

214

(k0 * roundup(_Nsize, strategy::out_width())) +

(n0 * kern_k);

{

#ifdef CYCLE_PROFILING

219

auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));

220

#endif

221

strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,

222

b_panel,

Michalis Spyrou

3e183d9

2019-08-23 15:31:08 +0100

[diff] [blame]

223

result_buffer, (nmax-n0),

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

224

(m_end - m_start), (nmax - n0), kern_k,

225

nullptr, Activation(), false);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

}

{

#ifdef CYCLE_PROFILING

230

auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (m_end - m_start) * _Ksize);

231

#endif

232

compute_row_sums(_qp, _Ksize, (m_end - m_start),

233

this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda), this->_lda,

234

local_row_sums);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

}

{

#ifdef CYCLE_PROFILING

239

auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (m_end - m_start) * _Nsize);

240

#endif

241

242

requantize_block_32(_qp, (nmax - n0), (m_end - m_start), result_buffer, (nmax - n0),

243

this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,

Georgios Pinitas

af56d52

2020-07-01 12:35:30 +0100

[diff] [blame]

244

local_row_sums, col_bias + (multi * _Nsize) + n0, n0);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

245

}

246

} while (p.next_dim0());

}

}

// Working space needed for intermediate result buffers.

251

size_t get_working_size() const override {

252

return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri));

253

}

254

255

void set_working_space(void *buffer) override {

256

working_space = buffer;

257

}

258

259

// Interface implementation - pretransposed

260

/* Always true: execute() reads B from the pretransposed buffer. */
bool B_is_pretransposed() const override {
    return true;
}

bool B_pretranspose_required() const override {

265

return (_B_transposed==nullptr);

266

}

267

268

size_t get_B_pretransposed_array_size() const override {

269

return get_col_sum_size() + (roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi));

270

}

271

Giorgio Arena

63e0beb

2021-09-24 14:04:27 +0100

[diff] [blame^]

272

void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

273

col_bias = reinterpret_cast<int32_t *>(in_buffer);

274

275

for (unsigned int i=0; i<_nmulti; i++) {

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

276

compute_col_sums(_qp, _Nsize, _Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize, i, 0);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

277

}

Giorgio Arena

63e0beb

2021-09-24 14:04:27 +0100

[diff] [blame^]

278

}

279

280

void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {

281

requantize_bias(in_buffer, B, ldb, B_multi_stride);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

282

283

uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);

284

Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());

285

_B_transposed = buffer;

286

strategy strat(_ci);

287

288

for (unsigned int multi=0; multi<_nmulti; multi++) {

289

for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {

290

const unsigned int kmax = std::min(k0 + _k_block, _Ksize);

291

const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());

292

293

for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {

294

const unsigned int xmax = std::min(x0+_n_block, _Nsize);

295

296

const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;

297

298

strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,

Georgios Pinitas

0cc50ed

2020-07-06 19:10:38 +0100

[diff] [blame]

299

x0, xmax, k0, kmax);

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

buffer += size;

}

}

}

}

void set_pretransposed_B_data(void *in_buffer) override {

308

uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);

309

_B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());

310

col_bias = reinterpret_cast<int32_t *>(in_buffer);

311

}

312

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

313

void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

314

_qp.bias = bias;

Georgios Pinitas

2019-10-14 19:03:09 +0100

[diff] [blame]

315

_qp.bias_multi_stride = bias_multi_stride;

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

316

}

Georgios Pinitas

2021-07-16 16:16:43 +0100

[diff] [blame]

317

318

/* Describes this GEMM as a GemmConfig: the method is GEMM_HYBRID, the
 * inner/outer block sizes are the K and N block sizes in use, and the
 * filter is the type name of the strategy. */
GemmConfig get_config() override {
    GemmConfig cfg;

    cfg.method           = GemmMethod::GEMM_HYBRID;
    cfg.inner_block_size = _k_block;
    cfg.outer_block_size = _n_block;
    cfg.filter           = get_type_name<strategy>();

    return cfg;
}

Georgios Pinitas