/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <stdio.h>
#include <assert.h>

#include <algorithm>

#include "arm_gemm.hpp"
#include "utils.hpp"

#include "buffer_manager.hpp"
#include "mergeresults.hpp"
#include "transform.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
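// e.g. ROUND_UP(100) == 128 and ROUND_UP(64) == 64.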

// Implementation of the GemmCommon abstract class.
//
// This implementation interleaves the source matrices in blocks - good for
// larger matrices.
namespace arm_gemm {

template<typename strategy, typename To, typename Tr>
class GemmInterleaved : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const bool _trA;
    const bool _trB;

    const Activation _act;

    const int _maxthreads;
    int _nthreads;
    const bool _pretransposed;

    /* Blocking info */
    unsigned int _k_block=0;
    unsigned int _x_block=0;
    unsigned int _Mround=0;

    /* Working space, pretransposed buffer, buffer manager */
    const Toi *_B_transposed=nullptr;
    BufferManager *_bm=nullptr;
    void *_working_space=nullptr;

    /* We will need to walk through the blocks of B in a few contexts, so
     * factor that out. */
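    /* The walk advances through X (output column) blocks fastest, then K
     * blocks, then multis; newkblock() reports when a K block boundary has
     * just been crossed, so callers know the A panel needs re-preparing. */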
    class blockwalker {
    private:
        /* Size loops, etc. based on our parent's configuration */
        const GemmInterleaved<strategy, To, Tr> &_parent;

        /* K, X and multi parameters for current iteration. */
        unsigned int _k0=0, _x0=0, _multi=0;

        unsigned int _index=0;
        bool _done=false;
        bool _newkblock=true;
        bool _newmulti=true;

    public:
        blockwalker(const GemmInterleaved<strategy, To, Tr> &parent) : _parent(parent) { }

        unsigned int xmax() {
            return std::min(_x0 + _parent._x_block, _parent._Nsize);
        }

        unsigned int kmax() {
            return std::min(_k0 + _parent._k_block, _parent._Ksize);
        }

        /* Advance to the next block, return false at the end. */
        bool advance(void) {
            if (_done) {
                return false;
            }

            _newkblock=false;
            _x0 += _parent._x_block;
            if (_x0 >= _parent._Nsize) {
                _x0=0;
                _k0 += _parent._k_block;
                if (_k0 >= _parent._Ksize) {
                    _k0=0;
                    _multi++;
                    if (_multi >= _parent._nmulti) {
                        _done=true;
                        return false;
                    }
                    _newmulti=true;
                }
                _newkblock=true;
            }
            _index++;

            return true;
        }

        unsigned int k0(void) { return _k0; }
        unsigned int x0(void) { return _x0; }
        unsigned int multi(void) { return _multi; }
        unsigned int index(void) { return _index; }
        bool done(void) { return _done; }
        bool newkblock(void) { return _newkblock; }
    };

    // A working size: One of these needed, regardless of thread count. Divided according to window.
    size_t get_a_working_size() const {
        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
    }

    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
    size_t get_b_working_size() const {
        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
    }

    // C working size: One needed per thread.
    size_t get_c_working_size() const {
        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
    }

    // Internal execute function.
    // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
    template<bool pretransposed>
    void execute_internal(unsigned int start, unsigned int end, int threadid) {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_ci);

        blockwalker current(*this);
        blockwalker next=current;

        /* Translate 'start' and 'end' into a position within the batches and rows. */
        const unsigned int window_per_batch = _Mround / strategy::out_height();
        unsigned int batch_0   = start / window_per_batch;
        unsigned int batch_end = end / window_per_batch;

        /* Compute the M values to operate on */
        unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height();
        unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
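        // For illustration (assumed numbers): with out_height()==8, _Mround==24
        // (so window_per_batch==3) and _nbatches==2, a thread given start=1,
        // end=5 gets batch_0=0, batch_end=1, m_0=8 and m_max=16 - i.e. rows
        // 8.._Msize of batch 0 and rows 0..16 of batch 1.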

        /* Make sure we've been set up correctly. */
        if (pretransposed) {
            assert(_B_transposed);
        } else {
            assert(_bm);
        }

        assert(_working_space);
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

        // Private buffers. Treat working_space as an array of C buffers
        // (one per thread) first, followed by the (window-divided) A
        // buffer.
        // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));

        // Shared buffers - these come either from BufferManager or _B_transposed.
        const Toi *b_panel;

        if (pretransposed) {
            b_panel = _B_transposed;
        }

        //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);

        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
        int kern_k = 0;

        for (;!current.done();current.advance()) {
            if (current.newkblock()) {
#ifdef CYCLE_PROFILING
                auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                    if (first_m >= last_m)
                        continue;

                    strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block),
                                              this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                              this->_lda, first_m, last_m, current.k0(), current.kmax(), _trA);
                }

                // Figure out how many "K" the kernel will actually process.
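                // e.g. (assumed numbers) kmax()-k0() == 250 with k_unroll() == 4
                // gives kern_k == iceildiv(250, 4) * 4 == 252.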
                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
                kern_k *= strat.k_unroll();
            }

            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());

            if (!pretransposed) {
                /* Look ahead to the next block and populate it if necessary.
                 * This avoids the populate operation becoming a bottleneck, and
                 * helps keep the threads synchronized (the first thread to get
                 * here will populate while the rest will advance).
                 *
                 * If we are running single threaded, bm->try_populate() will do
                 * nothing.
                 */
                if (next.advance()) {
                    _bm->try_populate(next.index(), [&](void *buffer) {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_PREPB, (next.xmax()-next.x0()) * (next.kmax()-next.k0()) * sizeof(Toi));
#endif

                        Toi *b_panel = reinterpret_cast<Toi *>(buffer);

                        strat.transforms.PrepareB(b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
                                                  next.x0(), next.xmax(), next.k0(), next.kmax(), _trB);
                    });
                }

                /* Get the buffer for this iteration from the BufferManager. */
                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv) {
#ifdef CYCLE_PROFILING
                    auto p=prof.ScopedProfiler(PROFILE_PREPB, (current.xmax()-current.x0()) * (current.kmax()-current.k0()) * sizeof(Toi));
#endif

                    Toi *b_panel = reinterpret_cast<Toi *>(bpv);

                    strat.transforms.PrepareB(b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
                                              current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
                }));
            }

            /* Do the actual work. */
            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;

                if (first_m >= last_m)
                    continue;

                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());

                    {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

                        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);

                        a_ptr += (strategy::out_height() * kern_k);
                    }

                    {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
                        /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
                        const bool first_pass = current.k0()==0;
                        const bool last_pass  = current.kmax()==_Ksize;

                        strat.transforms.Merge(this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
                                               c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
                                               ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                                               (last_pass ? _act : Activation()), !first_pass);
                    }
                }
            }

            if (pretransposed) {
                b_panel += (bblocks * strat.out_width() * kern_k);
            } else {
                _bm->release(current.index());
            }
        }
    }

public:
    GemmInterleaved(GemmInterleaved &) = delete;
    GemmInterleaved & operator= (GemmInterleaved &) = delete;

    /* Constructor */
    GemmInterleaved(const GemmArgs &args)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
          _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
          _pretransposed(args._pretransposed_hint) {
        const unsigned int L1_size = _ci->get_L1_cache_size();
        const unsigned int L2_size = _ci->get_L2_cache_size();

        assert(_maxthreads > 0);

        // Work out blocking parameters, or override from provided GemmConfig
        if (args._cfg && args._cfg->inner_block_size) {
            _k_block = args._cfg->inner_block_size;
        } else {
            // k_block: Find out how much of the larger array can be loaded into half the cache.
            // This should account for associative caches.
            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

            // Needs to be (at least a single) multiple of the K unroll level.
            _k_block /= strategy::k_unroll();
            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();

            // Now tune to presented problem size; this is how many blocks we need.
            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);

            // So divide the space equally into that many blocks.
            _k_block = iceildiv(_Ksize, num_k_blocks);

            // And round UP to the K unroll level required.
            _k_block = iceildiv(_k_block, strategy::k_unroll());
            _k_block *= strategy::k_unroll();
        }
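        // For illustration (assumed numbers): with a 32KB L1, 4-byte operands,
        // a 12-wide/8-high kernel with k_unroll()==1 and K==1000, _k_block
        // starts at 16384/48 == 341 and is then retuned to
        // iceildiv(1000, iceildiv(1000, 341)) == iceildiv(1000, 3) == 334.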

        if (args._cfg && args._cfg->outer_block_size) {
            _x_block = args._cfg->outer_block_size;
        } else {
            // x_block: Work out how many rows (of length k_block) will fit in the L2
            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
                       (sizeof(Toi) * _k_block);

            // Needs to be (at least a single) multiple of the kernel output width.
            _x_block /= strategy::out_width();
            _x_block = std::max(_x_block, 1U) * strategy::out_width();

            // And tune to the presented problem size.
            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
            _x_block = iceildiv(_Nsize, num_x_blocks);

            _x_block = iceildiv(_x_block, strategy::out_width());
            _x_block *= strategy::out_width();
        }

        // Work out the rounded size of M - needed for some buffers.
        _Mround = iceildiv(_Msize, strategy::out_height());
        _Mround *= strategy::out_height();
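        // e.g. (assumed numbers) _Msize==20 with out_height()==8 gives _Mround==24.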
    }

    // Interface implementation - Compulsory functions

    // Window size: Only the last thread should do a ragged block, so dole
    // out work in units of out_height. Factor batches into the window, but
    // not multi for now (as this would cause problems with the buffer
    // manager).
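    // e.g. (assumed numbers) _Mround==24, out_height()==8 and _nbatches==2
    // give a window size of (24/8)*2 == 6 units.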
    unsigned int get_window_size() const override {
        // _Mround is a multiple of out_height by definition.
        return (_Mround / strategy::out_height()) * _nbatches;
    }

    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
    void set_nthreads(int nthreads) override {
        _nthreads = std::min(nthreads, _maxthreads);
        if (_bm) {
            _bm->set_nthreads(_nthreads);
        }
    }

    // Execute
    void execute(unsigned int start, unsigned int end, int threadid) override {
        if (_pretransposed) {
            execute_internal<true>(start, end, threadid);
        } else {
            execute_internal<false>(start, end, threadid);
        }
    }

    // Interface implementation - working space
    size_t get_working_size() const override {
        // In all cases, we need one A buffer plus a C buffer per thread.
        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);

        // For pretransposed case, there is no working space needed for B.
        // Otherwise, we need a BufferManager.
        if (!_pretransposed) {
            size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
        }

        size += 64; // Add on a cache line extra for alignment.

        return size;
    }

    void set_working_space(void *working_space) override {
        // Make sure everything ends up cache line aligned
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int  = reinterpret_cast<intptr_t>(working_space);

        size_t diff=0;

        if (working_space_int & 0x3F) {
            diff = 0x40 - (working_space_int & 0x3F);
        }
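        // e.g. a pointer ending in 0x08 gives diff == 0x38, pushing the buffer
        // up to the next 64-byte boundary.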

        working_space_bytes += diff;

        if (_pretransposed) {
            // Pretransposed case: just set internal pointer to parameter value.
            _working_space = reinterpret_cast<void *>(working_space_bytes);
        } else {
            // Otherwise, use the first part of the working space for the buffer manager.
            // It's legal to call this again so don't leak a buffer manager if it already existed.
            delete _bm;

            _bm = new BufferManager(_nthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));

            working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());

            _working_space = reinterpret_cast<void *>(working_space_bytes);
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return _pretransposed;
    }

    bool B_pretranspose_required() const override {
        return _pretransposed && (_B_transposed==nullptr);
    }

    // TODO: this could almost certainly be considerably simpler.
    size_t get_B_pretransposed_array_size() const override {
        size_t total=0;
        blockwalker current(*this);

        do {
            /* Figure out the size of each block. */
            unsigned int x_size = (current.xmax() - current.x0());
            unsigned int k_size = (current.kmax() - current.k0());

            /* Round sizes up as needed. */
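            /* e.g. (assumed numbers) an X block of 100 columns with
             * out_width()==12 is padded to 108, and a K block of 250 with
             * k_unroll()==4 is padded to 252. */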
            x_size = iceildiv(x_size, strategy::out_width());
            x_size *= strategy::out_width();

            k_size = iceildiv(k_size, strategy::k_unroll());
            k_size *= strategy::k_unroll();

            total += x_size * k_size * sizeof(Toi);
        } while (current.advance());

        return total;
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        blockwalker current(*this);
        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
        _B_transposed = buffer;
        strategy strat(_ci);

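        // Fill the buffer in the same blockwalker order that execute_internal()
        // consumes it, so the pretransposed B data can be read sequentially.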
        do {
            /* Figure out the size of each block. */
            unsigned int x_size = (current.xmax() - current.x0());
            unsigned int k_size = (current.kmax() - current.k0());

            /* Round sizes up as needed. */
            x_size = iceildiv(x_size, strategy::out_width());
            x_size *= strategy::out_width();

            k_size = iceildiv(k_size, strategy::k_unroll());
            k_size *= strategy::k_unroll();

            strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                      current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);

            buffer += (x_size * k_size);
        } while (current.advance());
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
    }

    ~GemmInterleaved() override {
        delete _bm;
    }
};

} // namespace arm_gemm