/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <algorithm>
#include <cassert>

#include "arm_gemm.hpp"
#include "convolver.hpp"
#include "mergeresults.hpp"
#include "performance_parameters.hpp"
#include "quantized.hpp"
#include "transform.hpp"
#include "utils.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
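// For example, with ALLOC_ROUND at 64, ROUND_UP(100) = ((100 + 63) / 64) * 64 = 128,
// so each sub-buffer carved out of the working space starts on a fresh cache line.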

// Implementation of the GemmCommon abstract class.
//
// This implementation interleaves the source matrices in blocks - good for
// larger matrices.

namespace arm_gemm {

namespace {

// Some kernels output to a linear buffer and require a separate merge step.
// Others output directly to the matrix result.  This helper class calls the
// appropriate functions, using templating to avoid calling non-existent
// functions.
template<bool MergeStep, typename OutputStage>
class kernel_and_merge {
public:
    template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
    static void run (
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
        Tab *acc_buff);
};
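
// Four specializations of run() follow, one per combination of MergeStep and
// OutputStage used in this file:
//   <true,  Nothing>      - kernel writes to c_panel, then a separate Merge step.
//   <false, Nothing>      - kernel writes (or accumulates) directly into C.
//   <false, Requantize32> - kernel with the requantization folded in.
//   <true,  Requantize32> - kernel writes to c_panel, then requantize_block_32.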

// Run a kernel and call the separate merge step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, Nothing>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
    unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
    const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
{
    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
    }
}

// Run a kernel with integrated merge
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, Nothing>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
    unsigned int n_0, unsigned int n_max, const Tr *biasptr,
    const Activation &act, bool accumulate, const Nothing &, const int32_t *,
    Tab *acc_buff)
{
#ifdef CYCLE_PROFILING
    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif

    // We need to offset the C pointer, but as it might be NULL (requesting output to accumulation buffer) we need
    // to be careful not to offset a null pointer.
    Tri *offset_c_ptr;

    if (c_ptr == nullptr) {
        offset_c_ptr = nullptr;
    } else {
        offset_c_ptr = c_ptr + m_0 * ldc + n_0;
    }

    strat.kernel(// A and B pointers are just the packed panels.
                 a_ptr, b_panel,
                 // Provide relevant part of output array and row stride.
                 offset_c_ptr, ldc,
                 // M, N, K sizes
                 m_max-m_0, n_max - n_0, kern_k,
                 // Bias, activation, accumulation.  Need to offset the bias as needed.
                 biasptr ? biasptr + n_0 : nullptr, act, accumulate,
                 // Accumulation buffer.
                 acc_buff );
}

// Run a kernel with integrated merge, quantizing
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, Requantize32>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
    unsigned int n_0, unsigned int n_max, const Tr *,
    const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
    Tab *acc_buff)
{
#ifdef CYCLE_PROFILING
    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif

    strat.kernel(// A and B pointers are just the packed panels.
                 a_ptr, b_panel,
                 // Provide relevant part of output array and row stride.
                 c_ptr + m_0 * ldc + n_0, ldc,
                 // M, N, K sizes
                 m_max-m_0, n_max - n_0, kern_k,
                 // Bias, activation, accumulation.  Need to offset the bias as needed.
                 col_bias + n_0, qp, n_0, accumulate, acc_buff);
}

// Run a kernel and call the separate quantize step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, Requantize32>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
    Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
    unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
    const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
    Tab *)
{
    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        // The interleaved kernel outputs in blocks - each block is a
        // row-major matrix of size out_width * out_height.  The merge
        // kernels are designed to deal with this but the requantizer is
        // not, so we need to requantize one block at a time.
        for (int i=0; i<bblocks; i++) {
            unsigned int n_start = n_0 + (strategy::out_width() * i);
            unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);

            // The row bias is interleaved with the transposed A data, get a pointer to it here.
            const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);

            requantize_block_32(qp, (n_end - n_start), (m_max-m_0),
                                c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
                                c_ptr + m_0 * ldc + n_start, ldc,
                                row_bias, col_bias + n_start, n_start);
        }
    }
}

// Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in
// "requantizing" context where the output will be requantized.
//
// These require different input transforms, as if we are requantizing we want to sum the rows of the A input, and
// if we are not we don't.
//
// This helper class allows the appropriate transforms to be found, without requiring kernels that don't support
// quantization to define useless "quantized" transforms.
template<typename strategy, bool quantized>
class transform_type {
public:
    typedef decltype(strategy::transforms) type;
};

template<typename strategy>
class transform_type<strategy, true> {
public:
    typedef decltype(strategy::transforms_quantized) type;
};
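
// For example, transform_type<strategy, true>::type names the type of the
// strategy's 'transforms_quantized' member, so only strategies used in a
// requantizing context have to provide that member at all.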

// We need a similar trick here to figure out what type the accumulator buffer should be.
template<typename strategy, typename OutputStage>
class accumulate_buffer_type {
public:
    typedef typename strategy::result_type type;
};

template<typename strategy>
class accumulate_buffer_type<strategy, Requantize32> {
public:
    typedef int32_t type;
};

} // anonymous namespace

template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false>
class GemmInterleaved : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;
    typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;
    const unsigned int _Ksections;
    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const bool _thread_columns;

    const Activation _act;

    const int _maxthreads;
    int _nthreads;

    /* Blocking info */
    unsigned int _k_block=0;
    unsigned int _x_block=0;
    unsigned int _Mround=0;

    /* Working space and pretransposed buffer */
    const Toi *_B_transposed=nullptr;
    void *_working_space=nullptr;

    Tab *_accumulation_buffer=nullptr;

    /* Output stage */
    OutputStage _os;

    /* Quantized support (in addition to 'output stage' above) */
    int32_t *col_bias = nullptr;

    /* Indirect parameters.  _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver - only set up for convolution problems, so also doubles as a flag. */
    std::unique_ptr<convolver<To>> _convolver = nullptr;

    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _Nsize * _nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }

    /* We will need to walk through the blocks of B in a few contexts, so
     * factor that out. */
    class blockwalker {
    private:
        /* Size loops, etc. based on our parent's configuration */
        const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent;

        /* K, X and multi parameters for current iteration. */
        unsigned int _k0=0, _x0=0, _multi=0;

        /* Range of X to iterate over - used in "ForceThreadColumns" cases */
        unsigned int _x_start=0;
        unsigned int _x_end=_parent._Nsize;

        unsigned int _index=0;
        bool _done=false;
        bool _newkblock=true;
        bool _newmulti=true;

    public:
        blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { }

        blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent,
                    unsigned int x_start, unsigned int x_end) : _parent(parent), _x0(x_start), _x_start(x_start), _x_end(x_end) { }

        unsigned int xmax() {
            return std::min(_x0 + _parent._x_block, _x_end);
        }

        unsigned int kmax() {
            return std::min(_k0 + _parent._k_block, _parent._Ktotal);
        }

        /* Advance to the next block, return false at the end. */
        bool advance(void) {
            if (_done) {
                return false;
            }

            _newkblock=false;
            _x0 += _parent._x_block;
            if (_x0 >= _x_end) {
                _x0=_x_start;
                _k0 += _parent._k_block;
                if (_k0 >= _parent._Ktotal) {
                    _k0=0;
                    _multi++;
                    if (_multi >= _parent._nmulti) {
                        _done=true;
                        return false;
                    }
                    _newmulti=true;
                }
                _newkblock=true;
            }
            _index++;

            return true;
        }

        unsigned int k0(void) { return _k0; }
        unsigned int x0(void) { return _x0; }
        unsigned int multi(void) { return _multi; }
        unsigned int index(void) { return _index; }
        bool done(void) { return _done; }
        bool newkblock(void) { return _newkblock; }
    };

    // "k block" has two distinct uses: figuring out which iterations of K
    // to actually process, but also various size/pointer computations.  The
    // latter needs to take account of the extra space needed for the row
    // sums, if appropriate.
    unsigned int get_total_k_depth() const {
        unsigned int k_depth = _k_block;

        if (std::is_same<OutputStage, Requantize32>::value) {
            k_depth += sizeof(int32_t) / sizeof(Toi);
        }

        return k_depth;
    }
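    // Illustrative example: with Toi of one byte (e.g. an int8 kernel) under
    // Requantize32 this adds sizeof(int32_t)/sizeof(Toi) = 4 extra elements per
    // packed row, which is where the embedded per-row sums live.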

    // A working size.
    size_t get_a_working_size() const {
        if (_thread_columns) {
            // For 2D threading: allocate a buffer of one block of rows per thread
            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
        } else {
            // For 1D threaded: one of these needed, regardless of thread count.  Divided according to window.
            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
        }
    }

    // C working size: One needed per thread.  Not needed if there is no merge step.
    size_t get_c_working_size() const {
        if (MergeStep) {
            return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
        } else {
            return 0;
        }
    }

    // Accumulation buffer size
    size_t get_accumulation_buffer_size() const {
        // We only support an accumulation buffer for non-merge cases.
        if (MergeStep) {
            return 0;
        }

        // Check if we are actually blocking
        if (_k_block == _Ktotal) {
            return 0;
        }

        // We are no-merge, non-quantized with active blocking: accumulation buffer needed.
        size_t size_per_buffer = sizeof(Tab) * strategy::out_height() * strategy::out_width();
        size_t num_buffers = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti;

        return num_buffers * size_per_buffer;
    }

    // Get pointer into accumulation buffer
    Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const {
        // Don't do anything if there's no buffer.
        if (_accumulation_buffer == nullptr) {
            return nullptr;
        }

        // Here we are indexing an appropriately sized pointer, so no sizeof() needed to convert to bytes.
        size_t size_per_buffer = strategy::out_height() * strategy::out_width();

        size_t buffer_rows = iceildiv(_Msize, strategy::out_height());
        size_t buffer_cols = iceildiv(_Nsize, strategy::out_width());
        size_t buffers_per_batch = (buffer_rows * buffer_cols);
        size_t buffers_per_multi = buffers_per_batch * _nbatches;

        // M/N must reference the top-left corner of a block.
        size_t row = M / strategy::out_height();
        assert(M % strategy::out_height() == 0);
        size_t col = N / strategy::out_width();
        assert(N % strategy::out_width() == 0);

        size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col;

        return _accumulation_buffer + (buffer_index * size_per_buffer);
    }
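    // Illustrative example (hypothetical sizes): with out_height()=8, out_width()=12,
    // _Msize=64 and _Nsize=48 there are buffer_rows=8 and buffer_cols=4 blocks, so
    // the block whose top-left corner is (M=16, N=24) in batch 0 of multi 0 has
    // buffer_index = (16/8)*4 + (24/12) = 10, i.e. an offset of 10*8*12 elements.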

    int32_t row_sum_multiplier() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&_os);

            return -qp->b_offset;
        }

        return 0;
    }

    // Heuristics to decide whether to use the 'thread columns' regime
    static bool is_thread_columns(const GemmArgs &args) {
        // For now, there is a template parameter to force it.
        if (ForceThreadColumns) {
            return true;
        }

        // Never do this for single threaded cases.
        if (args._maxthreads == 1) {
            return false;
        }

        // How many blocks of work are available for threading on M?
        int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;

        // If we just can't share the work across threads with the row threading regime.
        if (args._maxthreads > m_blocks) {
            return true;
        }

        // If the row threading regime is too wasteful (20% threshold)
        if (((roundup(m_blocks, args._maxthreads) * 100) / m_blocks) > 120) {
            return true;
        }

        return false;
    }
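    // Illustrative example (hypothetical sizes): with out_height()=8, _Msize=33,
    // _nbatches=1 and 4 threads, m_blocks=5; roundup(5,4)=8 and (8*100)/5=160,
    // which is above the 120 threshold, so the columns regime is selected.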

    static unsigned int get_ktotal(const GemmArgs &args) {
        return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
    }

    static unsigned int get_k_block_size(const GemmArgs &args) {
        if (args._cfg && args._cfg->inner_block_size) {
            return args._cfg->inner_block_size;
        }

        // K blocking not supported if we are requantizing.
        if (std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        const unsigned int L1_size = args._ci->get_L1_cache_size();
        unsigned int k_block;

        // k_block: Find out how much of the larger array can be loaded into half the cache.
        // This should account for associative caches.
        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

        // Needs to be (at least a single) multiple of the K unroll level.
        k_block /= strategy::k_unroll();
        k_block = std::max(k_block, 1U) * strategy::k_unroll();

        // Now tune to presented problem size; this is how many blocks we need.
        unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block);

        // So divide the space equally into that many blocks.
        k_block = iceildiv(get_ktotal(args), num_k_blocks);

        // And round UP to the K unroll level required.
        k_block = roundup(k_block, strategy::k_unroll());

        assert(k_block > 0);

        return k_block;
    }
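    // Worked example with hypothetical values: a 32KB L1, 4-byte Toi,
    // out_width()=16, out_height()=8, k_unroll()=1 and a total K of 500 gives an
    // initial estimate of 16384/(4*16)=256; that needs iceildiv(500,256)=2
    // blocks, so the final k_block is iceildiv(500,2)=250.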

    static unsigned int get_x_block_size(const GemmArgs &args) {
        if (is_thread_columns(args)) {
            // In 2D mode, override X block, because we will process width first.
            return roundup(args._Nsize, strategy::out_width());
        }

        if (args._cfg && args._cfg->outer_block_size) {
            return roundup(args._cfg->outer_block_size, strategy::out_width());
        }

        unsigned int x_block;
        const unsigned int L2_size = args._ci->get_L2_cache_size();
        const unsigned int k_block = get_k_block_size(args);

        // x_block: Work out how many rows (of length k_block) will fit in the L2
        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
        const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());

        // .. if the L1 contents is bigger than the L2, just return a minimal size block.
        if (k_block_area > scaled_l2_size) {
            return strategy::out_width();
        }

        x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);

        // Needs to be (at least a single) multiple of the kernel output width.
        x_block /= strategy::out_width();
        x_block = std::max(x_block, 1u) * strategy::out_width();

        // And tune to the presented problem size.
        unsigned int num_x_blocks = iceildiv(args._Nsize, x_block);
        x_block = iceildiv(args._Nsize, num_x_blocks);

        x_block = roundup(x_block, strategy::out_width());

        assert(x_block > 0);

        return x_block;
    }
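    // Continuing the hypothetical example: with a 1MB L2 the budget is
    // (1048576*9)/10 = 943718 bytes, the panel area is 250*4*(16+8) = 24000 bytes,
    // so x_block starts at (943718-24000)/(4*250) = 919 columns before being
    // rounded to out_width() and balanced against the problem's N size.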

public:
    GemmInterleaved(GemmInterleaved &) = delete;
    GemmInterleaved & operator= (GemmInterleaved &) = delete;

    /* Constructor */
    GemmInterleaved(const GemmArgs &args, const OutputStage &os)
                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
                      _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
                      _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
                      _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
                      _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
                      _os(os) { }

    /* Constructor without OutputStage */
    GemmInterleaved(const GemmArgs &args)
                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
                      _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
                      _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
                      _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
                      _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
                      _os() { }

    // Interface implementation - Compulsory functions

    // Window size: Only the last thread should do a ragged block, so dole
    // out work in units of out_height.  Factor batches into the window, but
    // not multi for now (as this would cause problems with the buffer
    // manager).
    ndrange_t get_window_size() const override {
        unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches;

        if (_thread_columns) {
            return { row_blocks, iceildiv(_Nsize, strategy::out_width()) };
        } else {
            // _Mround is a multiple of out_height by definition.
            return { row_blocks };
        }
    }
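    // Illustrative example (hypothetical sizes): _Msize=100, out_height()=8 and
    // _nbatches=2 give _Mround=104 and a 1D window of (104/8)*2 = 26 units; in
    // thread-columns mode a second dimension of iceildiv(_Nsize, out_width())
    // column strips is added.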

    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
    void set_nthreads(int nthreads) override {
        _nthreads = std::min(nthreads, _maxthreads);
    }

    // Execute
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif

        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        assert(_working_space);
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

        /* Align if needed */
        intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
        if (working_space_v & 0x3f) {
            intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
            working_space_bytes += alignment_offset;
        }

        strategy strat(_ci);

        const auto start = work_range.get_position(0);
        const auto end   = work_range.get_position_end(0);

        /* Translate 'start' and 'end' into a position within the batches and rows. */
        const unsigned int window_per_batch = _Mround / strategy::out_height();
        unsigned int batch_0   = start / window_per_batch;
        unsigned int batch_end = end   / window_per_batch;

        // In ThreadColumns mode, process work one horizontal strip at a time.
        // Transpose the block of needed rows at the start, then do all the work on that block.
        if (_thread_columns) {
            const auto start_x = work_range.get_position(1) * strategy::out_width();
            const auto end_x   = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);

            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
                                                          (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));

            for (unsigned int multi=0; multi<_nmulti; multi++) {
                for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                    unsigned int kmax=std::min(k0+_k_block, _Ktotal);

                    unsigned int rounded_width = roundup(_Nsize, strategy::out_width());

                    const bool first_pass = (k0==0);
                    const bool last_pass = (kmax==_Ktotal);

                    // Figure out how many "K" the kernel will actually process.
                    unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());

                    const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);

                    unsigned int batch = batch_0;
                    unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();

                    for (unsigned int p=start; p<end; p++) {
                        unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);

                        // Set up transposed 'A' block
                        {
#ifdef CYCLE_PROFILING
                            auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
#endif
                            // See comment above on transform_type<> class: this extracts either 'transforms' or
                            // 'transforms_quantized' as appropriate.
                            typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                            if (_indirect_buf != nullptr) {
                                transforms.PrepareA_indirect(a_panel,
                                                             _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                                             _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                            } else if (_convolver) {
                                transforms.PrepareA_convolution(a_panel,
                                                                this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                                                this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                            } else {
                                transforms.PrepareA(a_panel,
                                                    this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                                    this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
                            }
                        }

                        // Perform the kernel and merge step, either separately or together as required.
                        kernel_and_merge<MergeStep, OutputStage>::run(
#ifdef CYCLE_PROFILING
                            prof,
#endif
                            // Strategy and panel pointers
                            strat, a_panel, b_ptr, c_panel,
                            // Result buffer pointers
                            this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
                            // K size, and M/N ranges
                            kern_k, start_row, end_row, start_x, end_x,
                            // Only do bias on the first pass
                            ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
                            // Only do activation on the last pass, and accumulation on any non-first pass.
                            (last_pass ? _act : Activation()), !first_pass,
                            // Pass in quantization parameters for requantizing kernels (others will ignore)
                            _os, col_bias + (multi * _Nsize),
                            // Accumulation buffer (not yet implemented on this path)
                            static_cast<Tab *>(nullptr));

                        /* Increment to the next block */
                        start_row += strategy::out_height();
                        if (start_row >= _Msize) {
                            start_row = 0;
                            batch++;
                        }
                    }
                }
            }
        } else {
            blockwalker current(*this);

            /* Compute the M values to operate on */
            unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height();
            unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();

            // Private buffers.  Treat working_space as an array of C buffers
            // (one per thread) first, followed by the (window-divided) A
            // buffer.
            // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));

            const Toi *b_panel;
            b_panel = _B_transposed;

            // newkblock() is always true on the first iteration, so these will be set properly on the first loop.

            // kern_k tracks the accumulation depth for the CURRENT K block; a_panel_stride similarly tracks the total
            // stride of the A panel (i.e. with 4 added for cases with embedded row sums)

            // These are distinct from k_block and get_total_k_depth() which are based on the target K block size, and
            // used for addressing inside a_panel.

            // In cases where K blocking is in use and the blocks are not all the same size, the (smaller) final block
            // won't use all the memory allocated.
            unsigned int kern_k = 0;
            unsigned int a_panel_stride = 0;

            for (;!current.done();current.advance()) {
                if (current.newkblock()) {
#ifdef CYCLE_PROFILING
                    auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
                    // See comment above on transform_type<> class: this extracts either 'transforms' or
                    // 'transforms_quantized' as appropriate.
                    typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                    for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                        unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                        unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                        if (first_m >= last_m)
                            continue;

                        if (_indirect_buf != nullptr) {
                            transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                         _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                                         _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                        } else if (_convolver) {
                            transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                                            this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                        } else {
                            transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                                this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
                        }
                    }

                    // Figure out how many "K" the kernel will actually process.
                    kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll());

                    // Requantizing GEMMs have the row sums built in to the
                    // transposed data, so the stride between rows is 4 bytes
                    // larger than the (rounded) K value.

                    if(std::is_same<OutputStage, Requantize32>::value) {
                        a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
                    } else {
                        a_panel_stride = kern_k;
                    }
                }

                /* Do the actual work. */
                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                    const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();

                    if (first_m >= last_m)
                        continue;

                    // For the merge case we need to do this out_height() rows
                    // at a time, as that is the size of our intermediate
                    // buffer.  If we are not doing that, we can do all the
                    // relevant rows in one go.
                    unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);

                    // But in the case where we have an accumulation buffer, we can't do that after all, unless
                    // there is no N blocking.
                    if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
                        m_step = strategy::out_height();
                    }

                    for (unsigned int y=first_m; y<last_m; y+=m_step) {
                        unsigned int ymax = std::min(_Msize, y + m_step);

                        const bool first_pass = (current.k0() == 0);
                        const bool last_pass = (current.kmax() == _Ktotal);

                        // Pointer to appropriate part of result array.
                        Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);

                        // If we are using an accumulation buffer, we don't pass the result pointer; this asks the
                        // kernel to write to the accumulation buffer instead, except on the last pass.
                        if (_accumulation_buffer && !last_pass) {
                            result_ptr = nullptr;
                        }

                        // Perform the kernel and merge step, either separately or together as required.
                        kernel_and_merge<MergeStep, OutputStage>::run(
#ifdef CYCLE_PROFILING
                            prof,
#endif
                            // Strategy and panel pointers
                            strat, a_ptr, b_panel, c_panel,
                            // Result buffer pointers
                            result_ptr, this->_ldc,
                            // K size, and M/N ranges
                            kern_k, y, ymax, current.x0(), current.xmax(),
                            // Only do bias on the first pass
                            ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                            // Only do activation on the last pass, and accumulation on any non-first pass.
                            (last_pass ? _act : Activation()), !first_pass,
                            // Pass in quantization parameters for requantizing kernels (others will ignore)
                            _os, col_bias + (current.multi() * _Nsize),
                            // Accumulation buffer
                            get_accumulation_buffer(y, current.x0(), batch, current.multi()) );

                        a_ptr += (strategy::out_height() * a_panel_stride);
                    }
                }

                b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
            }
        }
    }

    // Interface implementation - working space
    size_t get_working_size() const override {
        // In all cases, we need one A buffer plus a C buffer per thread, plus an accumulation buffer.
        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size();

        size += 128; // Add on two cache lines extra for alignment.

        return size;
    }

    void set_working_space(void *working_space) override {
        // Make sure everything ends up cache line aligned
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);

        size_t diff=0;

        if (working_space_int & 0x3F) {
            diff = 0x40 - (working_space_int & 0x3F);
        }

        working_space_bytes += diff;
        working_space_int += diff;

        // Pretransposed case: just set internal pointer to parameter value.
        _working_space = reinterpret_cast<void *>(working_space_bytes);

        // Set up accumulation buffer
        if (get_accumulation_buffer_size() > 0) {
            intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads);
            // Make sure the accumulation buffer is aligned (needed if the other blocks are not a multiple of cache line length)
            if (acc_buff_int & 0x3F) {
                acc_buff_int += (0x40 - (acc_buff_int & 0x3F));
            }
            _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int);
        } else {
            _accumulation_buffer = nullptr;
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return true;
    }

    bool B_pretranspose_required() const override {
        return (_B_transposed==nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        unsigned int x_size = roundup(_Nsize, strategy::out_width());

        return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_nmulti; i++) {
                // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size.
                compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
            }
        }

        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer;

        blockwalker current(*this);
        strategy strat(_ci);

        do {
            /* Figure out the size of each block. */
            unsigned int k_size = (current.kmax() - current.k0());

            // We need to insert padding at the end of each K section.
            // The computation needed is a little delicate - the coordinates from the block walker are expressed in
            // terms of the full, padded, _Ktotal.
            // But we need to transform each section with reference to the original, unpadded, input, letting the
            // transform pad each section as needed.
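            // Illustrative example (hypothetical sizes): with _Ksize=99 and
            // k_unroll()=4, each padded section is 100 deep, so kpos=100 maps to
            // k_section_base=1 and k_offset=0, and the transform below reads from
            // unpadded input row 1*99 + 0 = 99, the start of the second section.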

            // This is needed for computations below.
            const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());

            // The expected output format is also an entire <out_width> columns interleaved, then the next set of
            // columns, and so on.  This means, as we are breaking it up vertically, we have to do it one column at
            // a time.
            for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){
                unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());

                // Track where we are and how much work is left.
                unsigned int kpos  = current.k0();
                unsigned int kleft = k_size;

                while (kleft) {
                    // Which section are we in?  Based on the rounded-up section size.
                    unsigned int k_section_base = kpos / rounded_section_size;
                    // How far into the section are we?
                    unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                    // We will either copy the rest of this section, or to the end of the requested length.
                    unsigned int k_length = std::min(_Ksize - k_offset, kleft);

                    strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                              x0, xmax,
                                              (k_section_base * _Ksize) + k_offset,               // K starting point - compute row to read based on our section and the true section length.
                                              (k_section_base * _Ksize) + k_offset + k_length);   // K end point - starting point plus length computed above.

                    // We need to modify our position based on the ROUNDED version of what we just did.
                    unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                    buffer += strategy::out_width() * padded_length;

                    kpos  += padded_length;
                    kleft -= padded_length;
                }
            }
        } while (current.advance());
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }

    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);

            qp->bias = bias;
            qp->bias_multi_stride = bias_multi_stride;
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _Ksize);
        _indirect_buf = ptr;
    }

    void set_convolution_parameters(ConvolutionParameters parms) override {
        assert(parms.input_channels == _Ksize);
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }

    // Estimate cycles for a given problem, given the provided performance parameters
    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));

        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi);
        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);

        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;

        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;

        // We can't thread over multis or width, which makes this a poor
        // choice in many threaded cases.  Penalize that here.
        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;

        if (parallelism_available < args._maxthreads) {
            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
        }

        return static_cast<uint64_t>(total_cycles);
    }
};

// Aliases for the variations
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>;

template<typename strategy, typename To, typename Tr>
using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>;

template<typename strategy, typename To, typename Tr>
using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;
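
// Illustrative driving sequence (a sketch only - the calls that supply the A/B/C
// array pointers come from the GemmCommon base interface and are not shown here):
//   1. Since B_is_pretransposed() is always true, allocate
//      get_B_pretransposed_array_size() bytes and call pretranspose_B_array()
//      once per B matrix (or set_pretransposed_B_data() if already packed).
//   2. Allocate get_working_size() bytes and hand them to set_working_space().
//   3. Optionally call set_nthreads(), then divide get_window_size() across
//      threads and have each thread call execute() on its own sub-range.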

} // namespace arm_gemm