Blame - src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp - ml/ComputeLibrary - Gitiles

blob: c4dceef922c9b4d0c9cfafd8aabcdca80309784a [file] [log] [blame]

Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	1	/*
Georgios Pinitas	5aa1a0b	2020-07-02 20:02:20 +0100	[diff] [blame]	2	* Copyright (c) 2017-2020 Arm Limited.
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#pragma once
				25
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	26	#include <algorithm>
David Mansell	318c9f4	2020-07-08 13:28:45 +0100	[diff] [blame]	27	#include <cassert>
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	28
				29	#include "arm_gemm.hpp"
				30	#include "utils.hpp"
				31
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	32	#include "mergeresults.hpp"
David Mansell	318c9f4	2020-07-08 13:28:45 +0100	[diff] [blame]	33	#include "performance_parameters.hpp"
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	34	#include "transform.hpp"
				35
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	36	#ifdef CYCLE_PROFILING
				37	#include "profiler.hpp"
				38	#endif
				39
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	40	// Some macros used to decide how much working space to allocate.
				41	// Round allocations up to the next cache line.
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	42	#define ALLOC_ROUND 64
				43	#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	44
				45	// Implementation of the GemmCommon abstract class.
				46	//
				47	// This implementation interleaves the source matrices in blocks - good for
				48	// larger matrices.
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	49	namespace arm_gemm {
				50
				51	template<typename strategy, typename To, typename Tr>
				52	class GemmInterleaved : public GemmCommon<To, Tr> {
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	53	typedef typename strategy::operand_type Toi;
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	54	typedef typename strategy::result_type Tri;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	55
				56	/* const properties set by constructor */
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	57	const CPUInfo * const _ci;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	58
				59	const unsigned int _Msize;
				60	const unsigned int _Nsize;
				61	const unsigned int _Ksize;
				62
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	63	const unsigned int _nbatches;
				64	const unsigned int _nmulti;
				65
Georgios Pinitas	48b3ef8	2019-10-14 19:03:09 +0100	[diff] [blame]	66	const Activation _act;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	67
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	68	const int _maxthreads;
				69	int _nthreads;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	70
				71	/* Blocking info */
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	72	unsigned int _k_block=0;
				73	unsigned int _x_block=0;
				74	unsigned int _Mround=0;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	75
				76	/* Working space, pretransposed buffer, buffer manager */
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	77	const Toi *_B_transposed=nullptr;
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	78	void *_working_space=nullptr;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	79
				80	/* We will need to walk through the blocks of B in a few contexts, so
				81	* factor that out. */
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	82	class blockwalker {
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	83	private:
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	84	/* Size loops, etc. based on our parent's configuration */
				85	const GemmInterleaved<strategy, To, Tr> &_parent;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	86
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	87	/* K, X and multi parameters for current iteration. */
				88	unsigned int _k0=0, _x0=0, _multi=0;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	89
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	90	unsigned int _index=0;
				91	bool _done=false;
				92	bool _newkblock=true;
				93	bool _newmulti=true;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	94
				95	public:
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	96	blockwalker(const GemmInterleaved<strategy, To, Tr> &parent) : _parent(parent) { }
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	97
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	98	unsigned int xmax() {
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	99	return std::min(_x0 + _parent._x_block, _parent._Nsize);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	100	}
				101
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	102	unsigned int kmax() {
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	103	return std::min(_k0 + _parent._k_block, _parent._Ksize);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	104	}
				105
				106	/* Advance to the next block, return false at the end. */
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	107	bool advance(void) {
				108	if (_done) {
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	109	return false;
				110	}
				111
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	112	_newkblock=false;
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	113	_x0 += _parent._x_block;
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	114	if (_x0 >= _parent._Nsize) {
				115	_x0=0;
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	116	_k0 += _parent._k_block;
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	117	if (_k0 >= _parent._Ksize) {
				118	_k0=0;
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	119	_multi++;
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	120	if (_multi >= _parent._nmulti) {
				121	_done=true;
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	122	return false;
				123	}
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	124	_newmulti=true;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	125	}
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	126	_newkblock=true;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	127	}
				128	_index++;
				129
				130	return true;
				131	}
				132
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	133	unsigned int k0(void) { return _k0; }
				134	unsigned int x0(void) { return _x0; }
				135	unsigned int multi(void) { return _multi; }
				136	unsigned int index(void) { return _index; }
				137	bool done(void) { return _done; }
				138	bool newkblock(void) { return _newkblock; }
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	139	};
				140
				141	// A working size: One of these needed, regardless of thread count. Divided according to window.
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	142	size_t get_a_working_size() const {
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	143	return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	144	}
				145
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	146	// C working size: One needed per thread.
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	147	size_t get_c_working_size() const {
David Mansell	d93991e	2018-07-06 14:52:52 +0100	[diff] [blame]	148	return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	149	}
				150
David Mansell	318c9f4	2020-07-08 13:28:45 +0100	[diff] [blame]	151	static unsigned int get_k_block_size(const GemmArgs &args) {
				152	if (args._cfg && args._cfg->inner_block_size) {
				153	return args._cfg->inner_block_size;
				154	}
				155
				156	const unsigned int L1_size = args._ci->get_L1_cache_size();
				157	unsigned int k_block;
				158
				159	// k_block: Find out how much of the larger array can be loaded into half the cache.
				160	// This should account for associative caches.
				161	k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
				162
				163	// Needs to be (at least a single) multiple of the K unroll level.
				164	k_block /= strategy::k_unroll();
				165	k_block = std::max(k_block, 1U) * strategy::k_unroll();
				166
				167	// Now tune to presented problem size; this is how many blocks we need.
				168	unsigned int num_k_blocks = iceildiv(args._Ksize, k_block);
				169
				170	// So divide the space equally into that many blocks.
				171	k_block = iceildiv(args._Ksize, num_k_blocks);
				172
				173	// And round UP to the K unroll level required.
				174	k_block = roundup(k_block, strategy::k_unroll());
				175
				176	return k_block;
				177	}
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	178
				179	public:
				180	GemmInterleaved(GemmInterleaved &) = delete;
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	181	GemmInterleaved & operator= (GemmInterleaved &) = delete;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	182
				183	/* Constructor */
Georgios Pinitas	48b3ef8	2019-10-14 19:03:09 +0100	[diff] [blame]	184	GemmInterleaved(const GemmArgs &args)
Georgios Pinitas	cfa2bba	2019-06-27 17:00:52 +0100	[diff] [blame]	185	: _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
Georgios Pinitas	0cc50ed	2020-07-06 19:10:38 +0100	[diff] [blame]	186	_nbatches(args._nbatches), _nmulti(args._nmulti),
David Mansell	318c9f4	2020-07-08 13:28:45 +0100	[diff] [blame]	187	_act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
				188	_k_block(get_k_block_size(args)) {
David Mansell	e39334c	2018-07-06 17:53:35 +0100	[diff] [blame]	189	const unsigned int L2_size = _ci->get_L2_cache_size();
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	190
David Mansell	e39334c	2018-07-06 17:53:35 +0100	[diff] [blame]	191	assert(_maxthreads > 0);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	192
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	193	// Work out blocking parameters, or override from provided GemmConfig
David Mansell	318c9f4	2020-07-08 13:28:45 +0100	[diff] [blame]	194	// TODO: Move outer block into a static function too.
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	195	if (args._cfg && args._cfg->outer_block_size) {
				196	_x_block = args._cfg->outer_block_size;
				197	} else {
				198	// x_block: Work out how many rows (of length k_block) will fit in the L2
				199	// Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
				200	_x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
Georgios Pinitas	cfa2bba	2019-06-27 17:00:52 +0100	[diff] [blame]	201	(sizeof(Toi) * _k_block);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	202
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	203	// Needs to be (at least a single) multiple of the kernel output width.
				204	_x_block /= strategy::out_width();
				205	_x_block = std::max(_x_block, 1U) * strategy::out_width();
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	206
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	207	// And tune to the presented problem size.
Georgios Pinitas	1d48065	2019-01-23 11:24:50 +0000	[diff] [blame]	208	unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	209	_x_block = iceildiv(_Nsize, num_x_blocks);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	210
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	211	_x_block = iceildiv(_x_block, strategy::out_width());
				212	_x_block *= strategy::out_width();
				213	}
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	214
				215	// Work out the rounded size of M - needed for some buffers.
David Mansell	e39334c	2018-07-06 17:53:35 +0100	[diff] [blame]	216	_Mround = iceildiv(_Msize, strategy::out_height());
David Mansell	d93991e	2018-07-06 14:52:52 +0100	[diff] [blame]	217	_Mround *= strategy::out_height();
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	218	}
				219
				220	// Interface implementation - Compulsory functions
				221
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	222	// Window size: Only the last thread should do a ragged block, so dole
				223	// out work in units of out_height. Factor batches into the window, but
				224	// not multi for now (as this would cause problems with the buffer
				225	// manager).
Joseph Dobson	6f8b17d	2020-02-11 19:32:11 +0000	[diff] [blame]	226	ndrange_t get_window_size() const override {
Georgios Pinitas	5aa1a0b	2020-07-02 20:02:20 +0100	[diff] [blame]	227	// _Mround is a multiple of out_height by definition.
				228	return { (_Mround / strategy::out_height()) * _nbatches };
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	229	}
				230
				231	// set_nthreads: record the thread count to use, clamped to the maximum fixed at construction.
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	232	void set_nthreads(int nthreads) override {
				233	_nthreads = std::min(nthreads, _maxthreads);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	234	}
				235
				236	// Execute
Georgios Pinitas	5aa1a0b	2020-07-02 20:02:20 +0100	[diff] [blame]	237	void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
				238	const auto start = work_range.get_position(0);
				239	const auto end = work_range.get_position_end(0);
Georgios Pinitas	0cc50ed	2020-07-06 19:10:38 +0100	[diff] [blame]	240	#ifdef CYCLE_PROFILING
				241	profiler prof;
				242	#endif
				243	strategy strat(_ci);
Georgios Pinitas	5aa1a0b	2020-07-02 20:02:20 +0100	[diff] [blame]	244
Georgios Pinitas	0cc50ed	2020-07-06 19:10:38 +0100	[diff] [blame]	245	blockwalker current(*this);
				246
				247	/* Translate 'start' and 'end' into a position within the batches and rows. */
				248	const unsigned int window_per_batch = _Mround / strategy::out_height();
				249	unsigned int batch_0 = start / window_per_batch;
				250	unsigned int batch_end = end / window_per_batch;
				251
				252	/* Compute the M values to operate on */
				253	unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height();
				254	unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
				255
				256	/* Make sure we've been set up correctly. */
				257	assert(_B_transposed);
				258	assert(_working_space);
				259	int8_t working_space_bytes = reinterpret_cast<int8_t >(_working_space);
				260
				261	// Private buffers. Treat working_space as an array of C buffers
				262	// (one per thread) first, followed by the (window-divided) A
				263	// buffer.
				264	// Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
				265	Toi * const a_panel = reinterpret_cast<Toi >(working_space_bytes + (_maxthreads get_c_working_size()));
				266	Tri * const c_panel = reinterpret_cast<Tri >(working_space_bytes + (threadid get_c_working_size()));
				267
				268	const Toi *b_panel;
				269	b_panel = _B_transposed;
				270
				271	//printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
				272
				273	// newkblock() is always true on the first iteration, so this will be set properly on the first loop.
				274	int kern_k = 0;
				275
				276	for (;!current.done();current.advance()) {
				277	if (current.newkblock()) {
				278	#ifdef CYCLE_PROFILING
				279	auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
				280	#endif
				281	for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
				282	unsigned int first_m = (batch == batch_0) ? m_0 : 0;
				283	unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
				284
				285	if (first_m >= last_m)
				286	continue;
				287
				288	strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block),
				289	this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
				290	this->_lda, first_m, last_m, current.k0(), current.kmax());
				291	}
				292
				293	// Figure out how many "K" the kernel will actually process.
				294	kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
				295	kern_k *= strat.k_unroll();
				296	}
				297
				298	int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
				299
				300	/* Do the actual work. */
				301	for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
				302	unsigned int first_m = (batch == batch_0) ? m_0 : 0;
				303	unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
				304
				305	const Toi a_ptr = a_panel + (batch _Mround + first_m) * _k_block;
				306
				307	if (first_m >= last_m)
				308	continue;
				309
				310	for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
				311	unsigned int ymax = std::min(_Msize, y + strategy::out_height());
				312
				313	{
				314	#ifdef CYCLE_PROFILING
				315	auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
				316	#endif
				317
				318	strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
				319
				320	a_ptr += (strategy::out_height() * kern_k);
				321	}
				322
				323	{
				324	#ifdef CYCLE_PROFILING
				325	auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
				326	#endif
				327	/* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
				328	const bool first_pass = current.k0()==0;
				329	const bool last_pass = current.kmax()==_Ksize;
				330
				331	strat.transforms.Merge(this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
				332	c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
				333	((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
				334	(last_pass ? _act : Activation()), !first_pass);
				335	}
				336	}
				337	}
				338
				339	b_panel += (bblocks * strat.out_width() * kern_k);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	340	}
				341	}
				342
				343	// Interface implementation - working space
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	344	size_t get_working_size() const override {
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	345	// In all cases, we need one A buffer plus a C buffer per thread.
				346	size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
				347
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	348	size += 64; // Add on a cache line extra for alignment.
				349
				350	return size;
				351	}
				352
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	353	void set_working_space(void *working_space) override {
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	354	// Make sure everything ends up cache line aligned
				355	int8_t working_space_bytes = reinterpret_cast<int8_t >(working_space);
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	356	intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	357
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	358	size_t diff=0;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	359
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	360	if (working_space_int & 0x3F) {
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	361	diff = 0x40 - (working_space_int & 0x3F);
				362	}
				363
				364	working_space_bytes += diff;
				365
Georgios Pinitas	0cc50ed	2020-07-06 19:10:38 +0100	[diff] [blame]	366	// Pretransposed case: just set internal pointer to parameter value.
				367	_working_space = reinterpret_cast<void *>(working_space_bytes);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	368	}
				369
				370	// Interface implementation - pretransposed
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	371	bool B_is_pretransposed() const override {
Georgios Pinitas	0cc50ed	2020-07-06 19:10:38 +0100	[diff] [blame]	372	return true;
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	373	}
				374
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	375	bool B_pretranspose_required() const override {
Georgios Pinitas	0cc50ed	2020-07-06 19:10:38 +0100	[diff] [blame]	376	return (_B_transposed==nullptr);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	377	}
				378
				379	// TODO: this could almost certainly be considerably simpler.
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	380	size_t get_B_pretransposed_array_size() const override {
				381	size_t total=0;
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	382	blockwalker current(*this);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	383
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	384	do {
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	385	/* Figure out the size of each block. */
Georgios Pinitas	1d48065	2019-01-23 11:24:50 +0000	[diff] [blame]	386	unsigned int x_size = (current.xmax() - current.x0());
				387	unsigned int k_size = (current.kmax() - current.k0());
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	388
				389	/* Round sizes up as needed. */
David Mansell	d93991e	2018-07-06 14:52:52 +0100	[diff] [blame]	390	x_size = iceildiv(x_size, strategy::out_width());
				391	x_size *= strategy::out_width();
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	392
David Mansell	d93991e	2018-07-06 14:52:52 +0100	[diff] [blame]	393	k_size = iceildiv(k_size, strategy::k_unroll());
				394	k_size *= strategy::k_unroll();
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	395
				396	total += x_size * k_size * sizeof(Toi);
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	397	} while (current.advance());
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	398
				399	return total;
				400	}
				401
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	402	void pretranspose_B_array(void in_buffer, const To B, const int ldb, const int B_multi_stride) override {
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	403	blockwalker current(*this);
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	404	Toi buffer = reinterpret_cast<Toi >(in_buffer);
				405	_B_transposed = buffer;
David Mansell	d93991e	2018-07-06 14:52:52 +0100	[diff] [blame]	406	strategy strat(_ci);
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	407
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	408	do {
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	409	/* Figure out the size of each block. */
Georgios Pinitas	1d48065	2019-01-23 11:24:50 +0000	[diff] [blame]	410	unsigned int x_size = (current.xmax() - current.x0());
				411	unsigned int k_size = (current.kmax() - current.k0());
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	412
				413	/* Round sizes up as needed. */
David Mansell	d93991e	2018-07-06 14:52:52 +0100	[diff] [blame]	414	x_size = iceildiv(x_size, strategy::out_width());
				415	x_size *= strategy::out_width();
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	416
David Mansell	d93991e	2018-07-06 14:52:52 +0100	[diff] [blame]	417	k_size = iceildiv(k_size, strategy::k_unroll());
				418	k_size *= strategy::k_unroll();
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	419
David Mansell	d93991e	2018-07-06 14:52:52 +0100	[diff] [blame]	420	strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
Georgios Pinitas	0cc50ed	2020-07-06 19:10:38 +0100	[diff] [blame]	421	current.x0(), current.xmax(), current.k0(), current.kmax());
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	422
				423	buffer += (x_size * k_size);
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	424	} while (current.advance());
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	425	}
				426
Anthony Barbier	5f70773	2018-07-03 16:22:02 +0100	[diff] [blame]	427	void set_pretransposed_B_data(void *in_buffer) override {
Michalis Spyrou	e7e96e0	2018-04-13 13:44:10 +0100	[diff] [blame]	428	_B_transposed = reinterpret_cast<Toi *>(in_buffer);
				429	}
David Mansell	318c9f4	2020-07-08 13:28:45 +0100	[diff] [blame]	430
				431	// Estimate cycles for given problem given provided parameters
				432	static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
				433	unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
				434
				435	uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
				436	uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
				437	uint64_t merge_bytes = static_cast<uint16_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
				438
				439	float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
				440	float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
				441	float merge_cycles = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
				442
				443	float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
				444
				445	// We can't thread over multis or width, which makes this a poor
				446	// choice in many threaded cases. Penalize that here.
				447	float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;
				448
				449	if (parallelism_available < args._maxthreads) {
				450	total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
				451	}
				452
				453	return static_cast<uint64_t>(total_cycles);
				454	}
Pablo Tello	eb82fd2	2018-02-23 13:43:50 +0000	[diff] [blame]	455	};
				456
				457	} // namespace arm_gemm