/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <stdio.h>
#include <assert.h>

#include <algorithm>

#include "arm_gemm.hpp"
#include "utils.hpp"

#include "buffer_manager.hpp"
#include "mergeresults.hpp"
#include "transform.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
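// For example, ROUND_UP(100) == 128 and ROUND_UP(128) == 128: sizes are
// rounded up to the next multiple of the 64-byte allocation granule.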

// Implementation of the GemmCommon abstract class.
//
// This implementation interleaves the source matrices in blocks - good for
// larger matrices.
namespace arm_gemm {

template<typename strategy, typename To, typename Tr>
class GemmInterleaved : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const bool _trA;
    const bool _trB;

    const Tr _alpha;
    const Tr _beta;

    const int _maxthreads;
    int _nthreads;
    const bool _pretransposed;

    /* Blocking info */
    unsigned int _k_block=0;
    unsigned int _x_block=0;
    unsigned int _Mround=0;

    /* Working space, pretransposed buffer, buffer manager */
    const Toi *_B_transposed=nullptr;
    BufferManager *_bm=nullptr;
    void *_working_space=nullptr;

    /* We will need to walk through the blocks of B in a few contexts, so
     * factor that out. */
    class blockwalker {
    private:
        /* Size loops, etc. based on our parent's configuration */
        const GemmInterleaved<strategy, To, Tr> &_parent;

        /* K, X and multi parameters for current iteration. */
        unsigned int _k0=0, _x0=0, _multi=0;

        unsigned int _index=0;
        bool _done=false;
        bool _newkblock=true;
        bool _newmulti=true;

    public:
        blockwalker(const GemmInterleaved<strategy, To, Tr> &parent) : _parent(parent) { }

        unsigned int xmax() {
            return std::min(_x0 + _parent._x_block, _parent._Nsize);
        }

        unsigned int kmax() {
            return std::min(_k0 + _parent._k_block, _parent._Ksize);
        }

        /* Advance to the next block, return false at the end. */
        bool advance(void) {
            if (_done) {
                return false;
            }

            _newkblock=false;
            _x0 += _parent._x_block;
            if (_x0 >= _parent._Nsize) {
                _x0=0;
                _k0 += _parent._k_block;
                if (_k0 >= _parent._Ksize) {
                    _k0=0;
                    _multi++;
                    if (_multi >= _parent._nmulti) {
                        _done=true;
                        return false;
                    }
                    _newmulti=true;
                }
                _newkblock=true;
            }
            _index++;

            return true;
        }

        unsigned int k0(void) { return _k0; }
        unsigned int x0(void) { return _x0; }
        unsigned int multi(void) { return _multi; }
        unsigned int index(void) { return _index; }
        bool done(void) { return _done; }
        bool newkblock(void) { return _newkblock; }
    };
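
    // The walker above advances x0 fastest, then k0, then multi: for each
    // (multi, k0) pair it sweeps the full range of X blocks.  newkblock() is
    // true on the first X block of each new K block, which is the point at
    // which execute_internal() has to (re-)prepare the A panel.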

    // A working size: One of these needed, regardless of thread count. Divided according to window.
    size_t get_a_working_size() const {
        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
    }

    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
    size_t get_b_working_size() const {
        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
    }

    // C working size: One needed per thread.
    size_t get_c_working_size() const {
        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
    }
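
    // Taken together (see set_working_space() and execute_internal()), the
    // working space is laid out as one C panel per thread followed by a
    // single window-divided A panel; in the non-pretransposed case the
    // BufferManager's B storage sits in front of all of that.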

    // Internal execute function.
    // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
    template<bool pretransposed>
    void execute_internal(unsigned int start, unsigned int end, int threadid) {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_ci);

        blockwalker current(*this);
        blockwalker next=current;

        /* Translate 'start' and 'end' into a position within the batches and rows. */
        const unsigned int window_per_batch = _Mround / strategy::out_height();
        unsigned int batch_0 = start / window_per_batch;
        unsigned int batch_end = end / window_per_batch;

        /* Compute the M values to operate on */
        unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height();
        unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();

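        // Illustrative example (assumed numbers, not taken from any strategy):
        // with out_height()==8, _Msize==100 (so _Mround==104 and
        // window_per_batch==13) and _nbatches==2, the total window is 26.  A
        // sub-window of start=10, end=16 gives batch_0=0, batch_end=1, m_0=80,
        // m_max=24: rows 80..99 of batch 0 followed by rows 0..23 of batch 1.
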
        /* Make sure we've been set up correctly. */
        if (pretransposed) {
            assert(_B_transposed);
        } else {
            assert(_bm);
        }

        assert(_working_space);
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

        // Private buffers. Treat working_space as an array of C buffers
        // (one per thread) first, followed by the (window-divided) A
        // buffer.
        // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));

        // Shared buffers - these come either from BufferManager or _B_transposed.
        const Toi *b_panel;

        if (pretransposed) {
            b_panel = _B_transposed;
        }

        //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);

        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
        int kern_k = 0;

        for (;!current.done();current.advance()) {
            if (current.newkblock()) {
#ifdef CYCLE_PROFILING
                auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0) ? m_0 : 0;
                    unsigned int last_m = (batch == batch_end) ? m_max : _Msize;

                    if (first_m >= last_m)
                        continue;

                    strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block),
                                              this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                              this->_lda, first_m, last_m, current.k0(), current.kmax(), _trA);
                }

                // Figure out how many "K" the kernel will actually process.
                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
                kern_k *= strat.k_unroll();
            }

            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());

            if (!pretransposed) {
                /* Look ahead to the next block and populate it if necessary.
                 * This avoids the populate operation becoming a bottleneck, and
                 * helps keep the threads synchronized (the first thread to get
                 * here will populate while the rest will advance).
                 *
                 * If we are running single threaded, bm->try_populate() will do
                 * nothing.
                 */
                if (next.advance()) {
                    _bm->try_populate(next.index(), [&](void *buffer) {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_PREPB, (next.xmax()-next.x0()) * (next.kmax()-next.k0()) * sizeof(Toi));
#endif

                        Toi *b_panel = reinterpret_cast<Toi *>(buffer);

                        strat.transforms.PrepareB(b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
                                                  next.x0(), next.xmax(), next.k0(), next.kmax(), _trB);
                    });
                }

                /* Get the buffer for this iteration from the BufferManager. */
                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv) {
#ifdef CYCLE_PROFILING
                    auto p=prof.ScopedProfiler(PROFILE_PREPB, (current.xmax()-current.x0()) * (current.kmax()-current.k0()) * sizeof(Toi));
#endif

                    Toi *b_panel = reinterpret_cast<Toi *>(bpv);

                    strat.transforms.PrepareB(b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
                                              current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
                }));
            }

            /* Do the actual work. */
            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                unsigned int first_m = (batch == batch_0) ? m_0 : 0;
                unsigned int last_m = (batch == batch_end) ? m_max : _Msize;

                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;

                if (first_m >= last_m)
                    continue;

                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());

                    {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

                        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);

                        a_ptr += (strategy::out_height() * kern_k);
                    }

                    {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
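                        // Merge accumulates into C: beta is applied only for the
                        // first K block (current.k0()==0); subsequent K blocks
                        // pass a beta of 1 so they add onto the partial result
                        // already stored in C.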
                        strat.transforms.Merge(this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
                                               c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
                                               _alpha, (current.k0()==0 ? _beta : static_cast<Tr>(1)));
                    }
                }
            }

            if (pretransposed) {
                b_panel += (bblocks * strat.out_width() * kern_k);
            } else {
                _bm->release(current.index());
            }
        }
    }

public:
    GemmInterleaved(GemmInterleaved &) = delete;
    GemmInterleaved & operator= (GemmInterleaved &) = delete;

    /* Constructor */
    GemmInterleaved(const GemmArgs<Tr> &args)
                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
                      _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
                      _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
                      _pretransposed(args._pretransposed_hint) {
        const unsigned int L1_size = _ci->get_L1_cache_size();
        const unsigned int L2_size = _ci->get_L2_cache_size();

        assert(_maxthreads > 0);

        // Work out blocking parameters, or override from provided GemmConfig
        if (args._cfg && args._cfg->inner_block_size) {
            _k_block = args._cfg->inner_block_size;
        } else {
            // k_block: Find out how much of the larger array can be loaded into half the cache.
            // This should account for associative caches.
            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

            // Needs to be (at least a single) multiple of the K unroll level.
            _k_block /= strategy::k_unroll();
            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();

            // Now tune to presented problem size; this is how many blocks we need.
            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);

            // So divide the space equally into that many blocks.
            _k_block = iceildiv(_Ksize, num_k_blocks);

            // And round UP to the K unroll level required.
            _k_block = iceildiv(_k_block, strategy::k_unroll());
            _k_block *= strategy::k_unroll();
        }
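
        // Worked example with illustrative numbers (a 32kB L1, 4-byte operands,
        // out_width()==12, out_height()==8, k_unroll()==1, _Ksize==1000): the
        // cache heuristic gives 16384 / (4 * 12) = 341, which splits the
        // 1000-long K dimension into 3 blocks, so _k_block ends up as
        // iceildiv(1000, 3) = 334.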

        if (args._cfg && args._cfg->outer_block_size) {
            _x_block = args._cfg->outer_block_size;
        } else {
            // x_block: Work out how many rows (of length k_block) will fit in the L2
            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
                       (sizeof(Toi) * _k_block);

            // Needs to be (at least a single) multiple of the kernel output width.
            _x_block /= strategy::out_width();
            _x_block = std::max(_x_block, 1U) * strategy::out_width();

            // And tune to the presented problem size.
            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
            _x_block = iceildiv(_Nsize, num_x_blocks);

            _x_block = iceildiv(_x_block, strategy::out_width());
            _x_block *= strategy::out_width();
        }
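
        // Continuing the illustrative numbers above with an assumed 512kB L2
        // and _Nsize==1000: the L2 budget gives (471859 - 26720) / 1336 = 333,
        // rounded down to 324 (a multiple of out_width()); evening out over
        // iceildiv(1000, 324) = 4 blocks gives 250, which rounds back up to a
        // final _x_block of 252.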

        // Work out the rounded size of M - needed for some buffers.
        _Mround = iceildiv(_Msize, strategy::out_height());
        _Mround *= strategy::out_height();
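        // e.g. _Msize==100 with out_height()==8 gives _Mround==104.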
    }

    // Interface implementation - Compulsory functions

    // Window size: Only the last thread should do a ragged block, so dole
    // out work in units of out_height. Factor batches into the window, but
    // not multi for now (as this would cause problems with the buffer
    // manager).
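    // (With the illustrative numbers used earlier - _Mround==104,
    // out_height()==8, _nbatches==2 - the window size would be 13 * 2 = 26.)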
    unsigned int get_window_size() const override {
        // _Mround is a multiple of out_height by definition.
        return (_Mround / strategy::out_height()) * _nbatches;
    }

    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
    void set_nthreads(int nthreads) override {
        _nthreads = std::min(nthreads, _maxthreads);
        if (_bm) {
            _bm->set_nthreads(_nthreads);
        }
    }

    // Execute
    void execute(unsigned int start, unsigned int end, int threadid) override {
        if (_pretransposed) {
            execute_internal<true>(start, end, threadid);
        } else {
            execute_internal<false>(start, end, threadid);
        }
    }

    // Interface implementation - working space
    size_t get_working_size() const override {
        // In all cases, we need one A buffer plus a C buffer per thread.
        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);

        // For pretransposed case, there is no working space needed for B.
        // Otherwise, we need a BufferManager.
        if (!_pretransposed) {
            size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
        }

        size += 64; // Add on a cache line extra for alignment.

        return size;
    }

    void set_working_space(void *working_space) override {
        // Make sure everything ends up cache line aligned
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);

        size_t diff=0;

        if (working_space_int & 0x3F) {
            diff = 0x40 - (working_space_int & 0x3F);
        }
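        // e.g. a pointer ending in 0x...28 is bumped by 0x18 to the next
        // 64-byte boundary; the extra cache line added in get_working_size()
        // guarantees there is room for this adjustment.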

        working_space_bytes += diff;

        if (_pretransposed) {
            // Pretransposed case: just set internal pointer to parameter value.
            _working_space = reinterpret_cast<void *>(working_space_bytes);
        } else {
            // Otherwise, use the first part of the working space for the buffer manager.
            // It's legal to call this again so don't leak a buffer manager if it already existed.
            delete _bm;

            _bm = new BufferManager(_nthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));

            working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());

            _working_space = reinterpret_cast<void *>(working_space_bytes);
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return _pretransposed;
    }

    bool B_pretranspose_required() const override {
        return _pretransposed && (_B_transposed==nullptr);
    }

    // TODO: this could almost certainly be considerably simpler.
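    // The size computed here is the sum, over every block visited by the
    // blockwalker, of that block's x and k extents rounded up to out_width()
    // and k_unroll() multiples; pretranspose_B_array() below performs the same
    // walk with the same rounding, so the two stay in step.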
    size_t get_B_pretransposed_array_size() const override {
        size_t total=0;
        blockwalker current(*this);

        do {
            /* Figure out the size of each block. */
            unsigned int x_size = (current.xmax() - current.x0());
            unsigned int k_size = (current.kmax() - current.k0());

            /* Round sizes up as needed. */
            x_size = iceildiv(x_size, strategy::out_width());
            x_size *= strategy::out_width();

            k_size = iceildiv(k_size, strategy::k_unroll());
            k_size *= strategy::k_unroll();

            total += x_size * k_size * sizeof(Toi);
        } while (current.advance());

        return total;
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        blockwalker current(*this);
        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
        _B_transposed = buffer;
        strategy strat(_ci);

        do {
            /* Figure out the size of each block. */
            unsigned int x_size = (current.xmax() - current.x0());
            unsigned int k_size = (current.kmax() - current.k0());

            /* Round sizes up as needed. */
            x_size = iceildiv(x_size, strategy::out_width());
            x_size *= strategy::out_width();

            k_size = iceildiv(k_size, strategy::k_unroll());
            k_size *= strategy::k_unroll();

            strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                      current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);

            buffer += (x_size * k_size);
        } while (current.advance());
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
    }

    ~GemmInterleaved() override {
        delete _bm;
    }
};

} // namespace arm_gemm