/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <alloca.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <tuple>
#include <type_traits>
#include <vector>

#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "convolver.hpp"
#include "ndrange.hpp"
#include "performance_parameters.hpp"
#include "transform.hpp"
#include "utils.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#ifndef UNUSED
#define __I_DEFINED_UNUSED
#define UNUSED(x) ((void)(x))
#endif

namespace arm_gemm {

namespace {

// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
// that.

template<typename OutputStage, bool SeparateQuantize = false>
class run_hybrid_kernel {
public:
    template<typename strategy, typename To, typename Tr>
    static void run (
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
        const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};

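// Non-quantized case ('Nothing' output stage): the output stage arguments are ignored and the kernel itself
// applies bias and activation, accumulating into the output when requested.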
template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Nothing, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
        const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate);
}

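// Fused requantizing case: the kernel consumes the Requantize32 parameters and the per-column bias directly and
// writes out the quantized result itself, so the bias/activation/accumulate arguments are unused here.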
template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Requantize32, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0);
}

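// Separate quantizing case: run the GEMM into a temporary int32 buffer, compute row sums if they are needed,
// then requantize that buffer into the final output as a distinct pass.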
template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Requantize32, true>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
    UNUSED(kern_k);
    // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
    assert(M <= strategy::out_height());
    // We don't yet support indirect output (as the quantizer can't do it).
    assert(output_arg.is_indirect == false);

    // We need a row sum buffer and intermediate output buffer.
    // These go on the stack as they are not too large, using an automatic array and alloca() respectively.
    int32_t row_sums[strategy::out_height()];
    typename strategy::result_type *result_buffer;

    unsigned int output_width = roundup(N, strategy::out_width());

    result_buffer = reinterpret_cast<typename strategy::result_type *>(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type)));

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
        // Perform the GEMM, into the output buffer.
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg<typename strategy::result_type>(result_buffer, output_width), nullptr, Activation(), false);
    }

    if (os.b_offset != 0) {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k);
#endif
        row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os);
    } else {
        memset(row_sums, 0, sizeof(int32_t) * strategy::out_height());
    }

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N);
#endif
        // Quantize
        requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0);
    }
}

} // anonymous namespace

// Implementation of the GemmCommon abstract class.
template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    GemmArgs _args;
    OutputStage _os = {};

    /* Quantized support (in addition to 'output stage' above) */
    int32_t *_col_bias = nullptr;

    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    /* Blocking info */
    const unsigned int _k_block;
    const unsigned int _n_block;
    const unsigned int _Mround;

    /* Pretransposed buffer. */
    const Toi *_B_transposed = nullptr;

    /* Indirect parameters.  _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver - only set up for convolution problems, so also doubles as a flag. */
    std::unique_ptr<convolver<To>> _convolver = nullptr;

    // Array of pointers to output rows
//  Tr * const * _output_ptrs;

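    // Iteration space walked by execute(): row blocks of out_height() x batches x column blocks of _n_block x multis.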
    const NDRange<4> _window_range;

    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _args._Nsize * _args._nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }

    static unsigned int get_ktotal(const GemmArgs &args) {
        return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
    }

    static unsigned int compute_k_block(const GemmArgs &args) {
        // Some kernels don't support accumulate mode - these can't do K blocking at all.
        if (!strategy::supports_accumulate() || std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        if (args._cfg && args._cfg->inner_block_size) {
            return args._cfg->inner_block_size;
        }

        // Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
        // datatypes); but don't divide into blocks until we hit 1.5X this size.
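        // For example (illustrative numbers only): with 4-byte operands the target block size is 2048/4 = 512,
        // so a ktotal of up to 768 stays as a single block, while ktotal = 2048 would be split into four blocks
        // of 512 (each rounded up to a multiple of k_unroll()).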
        unsigned int target_block_size = 2048 / sizeof(To);
        auto ktotal = get_ktotal(args);

        if (ktotal > ((target_block_size*3)/2)) {
            unsigned int target_blocks = iceildiv(ktotal, target_block_size);

            unsigned int block_size = iceildiv(ktotal, target_blocks);

            block_size = roundup(block_size, strategy::k_unroll());

            return block_size;
        }

        return ktotal;
    }

    // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width.  Otherwise do a
    // single block.
    static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) {
        if (args._cfg && args._cfg->outer_block_size) {
            return args._cfg->outer_block_size;
        }

        if (args._Nsize <= 64) {
            return args._Nsize;
        }

        if ((args._Msize / args._Nsize) > 155) {
            return args._Nsize;
        }

        // "Asymmetric" quantizing GEMMs require a different approach - the tall skinny blocks we would otherwise
        // use imply a great deal of repeated work performing the row sums.  If row sums are involved, work out how
        // much "column" parallelism is going to be required and set the block size accordingly.
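        // For example (illustrative numbers only): when row sums are needed (b_offset != 0), with out_height() == 8,
        // Msize = 8, one batch and one multi there is only one row block; on 8 threads we would need 8 column
        // blocks, so n_block becomes iceildiv(Nsize, 8) rounded up to out_width().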
        if (std::is_same<OutputStage, Requantize32>::value) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);

            // Row sums only needed if b_offset isn't 0
            if (qp->b_offset != 0) {
                // We can already parallelize across batches, multis and rows (in units of 'out_height')
                int multi_row_parallelism = args._nmulti * args._nbatches * iceildiv(args._Msize, strategy::out_height());

                // If this isn't enough, we will need to split up the columns too.
                if (multi_row_parallelism < args._maxthreads) {
                    unsigned int columns_needed = iceildiv(args._maxthreads, multi_row_parallelism);

                    unsigned int n_block = iceildiv(args._Nsize, columns_needed);

                    return roundup(n_block, strategy::out_width());
                }

                // Multi/Batch/Row parallelism is enough - don't split up the columns.
                return args._Nsize;
            }
        }

        if (args._Ksize <= 128 && args._maxthreads <= 16) {
            return strategy::out_width() * 3;
        }

        return strategy::out_width();
    }

public:
    GemmHybridIndirect(GemmHybridIndirect &) = delete;
    GemmHybridIndirect & operator= (GemmHybridIndirect &) = delete;

    /* Constructor */
    GemmHybridIndirect(const GemmArgs &args, const OutputStage &os)
        : _args(args), _os(os), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
          _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)),
          _Mround(roundup(args._Msize, strategy::out_height())),
          _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                        iceildiv(args._Nsize, _n_block), args._nmulti)
    {
        // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
        // GemmConfig.  Clear out the pointer to avoid accidents.
        _args._cfg = nullptr;
    }

    /* Constructor without OutputStage */
    GemmHybridIndirect(const GemmArgs &args)
        : _args(args), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
          _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
          _Mround(roundup(args._Msize, strategy::out_height())),
          _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                        iceildiv(args._Nsize, _n_block), args._nmulti)
    {
        // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
        // GemmConfig.  Clear out the pointer to avoid accidents.
        _args._cfg = nullptr;
    }

    // Interface implementation - Compulsory functions
    ndrange_t get_window_size() const override {
        return { _window_range.total_size() };
    }

    // This kernel can always be dynamically scheduled.
    bool supports_dynamic_scheduling() const override {
        return true;
    }

    // Execute
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_args._ci);

        std::vector<const To *> in_row_ptrs;
        std::vector<const To * const *> in_row_strings;
        std::vector<unsigned int> string_lengths;

        // In convolution mode, we need input pointers.
        if (_convolver) {
            in_row_ptrs.resize(strategy::out_height() * _args._Ksections, nullptr);
            in_row_strings.resize(_args._Ksections, nullptr);

            for (unsigned int i=0; i<_args._Ksections; i++) {
                in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
            }
        }

        // In any indirect mode, we need the string lengths.
        if (_args._indirect_input) {
            string_lengths = std::vector<unsigned int>(_args._Ksections, 0);
        }

        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
//      static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");

        /* For now, each work item implies all the K for a given output
         * pixel (so we don't need to synchronize access to the output
         * array).  So separate the loop over K blocks here. */
        for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
            unsigned int kmax = std::min(k0 + _k_block, _Ktotal);
            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

            const bool first_pass = (k0 == 0);
            const bool last_pass = (kmax == _Ktotal);

            unsigned int first_section = (k0 / _rounded_Ksize);
            unsigned int first_offset = (k0 % _rounded_Ksize);
            unsigned int kleft = kern_k;
            unsigned int sections=0;
            unsigned int offset = first_offset;

            if (_args._indirect_input) {
                while (kleft) {
                    // When chopping into sections: the amount that goes into 'string_lengths' is the amount to be
                    // processed (excluding padding).  But the amount we subtract from 'kleft' takes account of any
                    // padding applied.
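                    // For example (illustrative numbers only): with Ksize = 7 and k_unroll() = 4, _rounded_Ksize
                    // is 8, so a kern_k of 16 starting at k0 = 0 yields two sections with string_lengths of
                    // {7, 7}, while 'kleft' is reduced by the padded length of 8 for each.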
                    string_lengths[sections] = std::min(kleft, _args._Ksize - offset);
                    kleft -= std::min(kleft, _rounded_Ksize - offset);
                    sections++;
                    offset=0;
                }
            }

            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

            if (p.done()) {
                return;
            }

            // Process rows either 'out_height' rows at a time, or do all valid rows at once with a single kernel call.
            // The separate quantizer path only handles one block of rows at a time (as it has to store sums and intermediate results).
            // The convolution path only generates the pointers for one block of rows at a time.
            const bool process_all_rows = (!SeparateQuantize && !_convolver);

            do {
                const unsigned int m_start = p.dim(0) * strategy::out_height();
                const unsigned int m_end = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize);
//              const unsigned int m_end = std::min(m_start + strategy::out_height(), _args._Msize);
                const unsigned int batch = p.dim(1);
                const unsigned int n0 = p.dim(2) * _n_block;
                const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
                const unsigned int multi = p.dim(3);

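                // Locate the pretransposed B panel for this multi / K block / column block: panels are laid out
                // per-multi, then per K block (each spanning the full rounded N width), with each column block of
                // out_width() columns stored contiguously for kern_k rows.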
                const Toi *b_panel = _B_transposed +
                                     (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
                                     (k0 * roundup(_args._Nsize, strategy::out_width())) +
                                     (n0 * kern_k);

                IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);

#ifdef CYCLE_PROFILING
                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
                if (_indirect_buf) {
                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, sections, string_lengths.data(),
                        IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                } else if (_convolver) {
                    auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);

                    unsigned int pos=0;
                    auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);

                    while (!conv_rows.finished()) {
                        unsigned int width, conv_offset;

                        assert(pos < sections);

                        std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));

                        if (pos==0) {
                            assert(conv_offset == first_offset);
                        }
                        assert(width == string_lengths[pos]);
                        pos++;
                    }
                    assert(pos == sections);

                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, sections, string_lengths.data(),
                        IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                } else {
                    // Length to process.  This needs to exclude padding, but 'kmax' potentially includes it.
                    const unsigned int len = (std::min(_args._Ksize, kmax) - k0);

                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, 1, &len,
                        IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                }
            } while (process_all_rows ? p.next_dim1() : p.next_dim0());
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return true;
    }

    bool B_pretranspose_required() const override {
        return (_B_transposed==nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        // Start with actual pretransposed buffer...
        size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);

        // Space for result row pointers (not strictly needed any more but retained for indirect output testing)
        size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);

        if (std::is_same<OutputStage, Requantize32>::value) {
            size += get_col_sum_size();
        }

        return size;
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            _col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_args._nmulti; i++) {
                // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size.
                compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
            }
        }

        // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer;

        strategy strat(_args._ci);

        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
            for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);

                /* Figure out the size of each block. */
                unsigned int k_size = kmax - k0;

                // We need to insert padding at the end of each K section.
                // The computation needed is a little delicate - the coordinates from the block walker are expressed in
                // terms of the full, padded, _Ktotal.
                // But we need to transform each section with reference to the original, unpadded, input, letting the
                // transform pad each section as needed.

                // This is needed for computations below.
                const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());

                // The expected output format is also an entire <out_width> columns interleaved, then the next set of
                // columns, and so on.  This means, as we are breaking it up vertically, we have to do it one column at
                // a time.
                for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
                    unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);

                    // Track where we are and how much work is left.
                    unsigned int kpos  = k0;
                    unsigned int kleft = k_size;

                    while (kleft) {
                        // Which section are we in?  Based on the rounded-up section size.
                        unsigned int k_section_base = kpos / rounded_section_size;
                        // How far into the section are we?
                        unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                        // We will either copy the rest of this section, or to the end of the requested length.
                        unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
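                        // For example (illustrative numbers only): with Ksize = 7 and k_unroll() = 4, the rounded
                        // section size is 8, so kpos = 10 lands in section 1 at offset 2 and reads source rows
                        // starting at (1 * 7) + 2 = 9, padding back up to a multiple of 4 afterwards.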

                        strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                                  x0, xmax,
                                                  (k_section_base * _args._Ksize) + k_offset,               // K starting point - compute row to read based on our section and the true section length.
                                                  (k_section_base * _args._Ksize) + k_offset + k_length);   // K end point - starting point plus length computed above.

                        // We need to modify our position based on the ROUNDED version of what we just did.
                        unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                        buffer += strategy::out_width() * padded_length;

                        kpos  += padded_length;
                        kleft -= padded_length;
                    }
                }
            }
        }
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }

    // Estimate cycles for given problem given provided parameters
    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params, const OutputStage &os = {} ) {
        // Note: Current hybrid kernels don't actually round up height (they
        // have paths for each possible height).  Might need to make this
        // configurable in future.
        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());

        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;

        // TODO: A bit of a kludge here: current hybrid kernels incur extra
        // overhead where the width is not a multiple of kernel width.  It's
        // most noticeable where the overall width is quite low, so add 15%
        // penalty for such widths.
        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
            mac_cycles *= 1.15f;
        }

        uint64_t total_cycles = mac_cycles;

        // Quantizing kernels with separate quantize need to add in the extra stages.
        if (std::is_same<OutputStage, Requantize32>::value && SeparateQuantize) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);

            // Row sums: need to consider each value in A (batch * multi * M * K)...
            uint64_t rowsum_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Ksize, strategy::k_unroll());

            // ... but row sums are skipped if B offset==0.
            if (qp->b_offset == 0) {
                rowsum_bytes = 0;
            }

            // Use "prepare bytes per cycle" to store "row sum values per cycle".
            float rowsum_cycles = static_cast<float>(rowsum_bytes) / params.prepare_bytes_cycle;

            // Requantize: need to consider each value in C (batch * multi * M * N)
            uint64_t requantize_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * args._Nsize;

            // Use "merge bytes per cycle" to store "requantize values per cycle".
            float requantize_cycles = static_cast<float>(requantize_bytes) / params.merge_bytes_cycle;

            // Recalculate total_cycles with the extra components.
            total_cycles = mac_cycles + rowsum_cycles + requantize_cycles;
        }

        return total_cycles;
    }

    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);

            qp->bias = bias;
            qp->bias_multi_stride = bias_multi_stride;
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _args._Ksize);
        _indirect_buf = ptr;
    }

    void set_convolution_parameters(ConvolutionParameters parms) override {
        assert(parms.input_channels == _args._Ksize);
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }
};

} // namespace arm_gemm

#ifdef __I_DEFINED_UNUSED
#undef UNUSED
#endif