Blame - src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp - ml/ComputeLibrary

blob: 376d19cc656313d5433a9c85e68be5185c3ea3f5 [file] [log] [blame]

Joseph Dobson	6f8b17d	2020-02-11 19:32:11 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2020 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#pragma once
				25
				26	#include "arm_gemm.hpp"
				27	#include "utils.hpp"
				28
				29	#include "mergeresults.hpp"
				30	#include "transform.hpp"
				31
				32	#ifdef CYCLE_PROFILING
				33	#include "profiler.hpp"
				34	#endif
				35
				36	#include <algorithm>
				37	#include <cassert>
				38
				39	// Some macros used to decide how much working space to allocate.
				40	// Round allocations up to the next cache line.
				41	#define ALLOC_ROUND 64
				42	#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
				43
				44	// Implementation of the GemmCommon abstract class.
				45	//
				46	// This implementation interleaves the source matrices in blocks - good for
				47	// larger matrices.
				48	namespace arm_gemm {
				49
				50	template<typename strategy, typename To, typename Tr>
				51	class GemmInterleaved2d : public GemmCommon<To, Tr> {
    // Element types supplied by the strategy: Toi is the element type of the
    // packed A/B panels, Tri the element type of the kernel's result panel.
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    // Problem dimensions as passed in GemmArgs.
    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    // Forwarded to PrepareA / PrepareB respectively.
    const bool _trA;
    const bool _trB;

    // Activation applied by Merge on the last K pass only.
    const Activation _act;

    // _maxthreads sizes the working space; _nthreads is the value most
    // recently given to set_nthreads(), clamped to _maxthreads.
    const int _maxthreads;
    int _nthreads;

    /* Blocking info */
    // Block sizes along K and N, chosen in the constructor (either from the
    // supplied GemmConfig or derived from the L1/L2 cache sizes).
    unsigned int _k_block=0;
    unsigned int _x_block=0;

    // _Mround_div = ceil(M / out_height), _Mround = _Mround_div * out_height;
    // likewise for N with out_width.
    unsigned int _Mround_div=0;
    unsigned int _Mround=0;
    unsigned int _Nround_div=0;
    unsigned int _Nround=0;

    /* Working space, pretransposed buffer */
    // Set by set_working_space(), which aligns the pointer up to 64 bytes.
    void *_working_space=nullptr;
				84
				85	/* We will need to walk through the blocks of B in a few contexts, so
				86	* factor that out. */
				87	class blockwalker {
				88	private:
				89	/* Size loops, etc. based on our parent's configuration */
				90	const GemmInterleaved2d<strategy, To, Tr> &_parent;
				91
				92	/* K, X and multi parameters for current iteration. */
				93	unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
				94
				95	unsigned int _index=0;
				96	bool _done=false;
				97	bool _newkblock=true;
				98	bool _newmulti=true;
				99
				100	public:
				101	blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent)
				102	: _parent(parent)
				103	, _xmax { parent._Nsize }
				104	{ }
				105
				106	blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
				107	: _parent(parent)
				108	, _x0 { x0 }
				109	, _xmin { x0 }
				110	, _xmax { xmax }
				111	{
				112	assert(_x0 <= _xmax);
				113	}
				114
				115	unsigned int xmax() {
				116	return std::min(_x0 + _parent._x_block, _xmax);
				117	}
				118
				119	unsigned int kmax() {
				120	return std::min(_k0 + _parent._k_block, _parent._Ksize);
				121	}
				122
				123	/* Advance to the next block, return false at the end. */
				124	bool advance(void) {
				125	if (_done) {
				126	return false;
				127	}
				128
				129	_newkblock=false;
				130	_x0 += _parent._x_block;
				131	if (_x0 >= _xmax) {
				132	_x0=_xmin;
				133	_k0 += _parent._k_block;
				134	if (_k0 >= _parent._Ksize) {
				135	_k0=0;
				136	_multi++;
				137	if (_multi >= _parent._nmulti) {
				138	_done=true;
				139	return false;
				140	}
				141	_newmulti=true;
				142	}
				143	_newkblock=true;
				144	}
				145	_index++;
				146
				147	return true;
				148	}
				149
				150	unsigned int k0(void) { return _k0; }
				151	unsigned int x0(void) { return _x0; }
				152	unsigned int multi(void) { return _multi; }
				153	unsigned int index(void) { return _index; }
				154	bool done(void) { return _done; }
				155	bool newkblock(void) { return _newkblock; }
				156	};
				157
				158	// A working size: One of these needed, regardless of thread count. Divided according to window.
				159	size_t get_a_working_size() const {
				160	return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
				161	}
				162
				163	// B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
				164	size_t get_b_working_size() const {
				165	return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
				166	}
				167
				168	// C working size: One needed per thread.
				169	size_t get_c_working_size() const {
				170	return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
				171	}
				172
Georgios Pinitas	5aa1a0b	2020-07-02 20:02:20 +0100	[diff] [blame]	173	void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int nthreadid) {
Joseph Dobson	6f8b17d	2020-02-11 19:32:11 +0000	[diff] [blame]	174	strategy strat(_ci);
				175
				176	/* Translate 'start' and 'end' into a position within the batches and rows. */
				177	const unsigned int window_per_batch = _Mround / strategy::out_height();
				178	unsigned int batch_0 = m_start / window_per_batch;
				179	unsigned int batch_end = m_end / window_per_batch;
				180
				181	/* Compute the M values to operate on */
				182	unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
				183	unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
				184
				185	unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start);
				186	unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
				187
				188	blockwalker current(*this, n_0, n_max);
				189
				190	/* get workspace as int8_t */
				191	assert(_working_space);
				192	int8_t working_space_bytes = reinterpret_cast<int8_t >(_working_space);
				193
				194	auto c_panel_start = working_space_bytes;
				195	auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
				196	auto b_panel_start = a_panel_start + get_a_working_size() * _maxthreads;
				197
				198	auto c_panel = reinterpret_cast<Tri >(c_panel_start + get_c_working_size() threadid);
				199	auto a_panel = reinterpret_cast<Toi >(a_panel_start + get_a_working_size() nthreadid);
				200	auto b_panel = reinterpret_cast<Toi >(b_panel_start + get_b_working_size() threadid);
				201
				202
				203	// newkblock() is always true on the first iteration, so this will be set properly on the first loop.
				204
				205	int kern_k = 0;
				206	for (;!current.done();current.advance()) {
				207	const int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
				208	/*
				209	* The entirity of A^kblock is transpose upfront and computed against individual
				210	* blocks of B (xblock)
				211	*
				212	* Therefore, we only need to retranspose when k_block progresses
				213	*/
				214	if (current.newkblock()) {
				215	for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
				216	unsigned int first_m = (batch == batch_0) ? m_0 : 0;
				217	unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
				218
				219	if (first_m >= last_m)
				220	continue;
				221
				222	auto a_thread_panel_in = this->_Aptr
				223	+ (batch * this->_A_batch_stride)
				224	+ (current.multi() * this->_A_multi_stride);
				225
				226	auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
				227
				228	strat.transforms.PrepareA(
				229	a_thread_panel_out,
				230	a_thread_panel_in,
				231	this->_lda,
				232	first_m,
				233	last_m,
				234	current.k0(),
				235	current.kmax(),
				236	_trA);
				237	}
				238
				239	kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
				240	kern_k *= strat.k_unroll();
				241	}
				242
				243	auto b_panel_in = this->_Bptr + (current.multi() this->_B_multi_stride);
				244
				245	strat.transforms.PrepareB(
				246	b_panel, //dst
				247	b_panel_in, //src
				248	this->_ldb,
				249	current.x0(), //idx from
				250	current.xmax(), //idx to
				251	current.k0(),
				252	current.kmax(),
				253	_trB);
				254
				255	//Iterate over the batches
				256	for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
				257	unsigned int first_m = (batch == batch_0) ? m_0 : 0;
				258	unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
				259
				260	if (first_m >= last_m)
				261	continue;
				262
				263	const Toi a_ptr = a_panel + (batch _Mround + first_m) * _k_block;
				264
				265
				266	//Iterate over the inerleaved rows of the packed A matrix
				267	for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
				268	unsigned int ymax = std::min(_Msize, y + strategy::out_height());
				269
				270	strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
				271	a_ptr += (strategy::out_height() * kern_k);
				272
				273	const bool first_pass = current.k0()==0;
				274	const bool last_pass = current.kmax()==_Ksize;
				275
				276	auto c_panel_out = this->_Cptr
				277	+ this->_C_batch_stride * batch
				278	+ this->_C_multi_stride * current.multi();
				279
				280	auto bias = (first_pass && this->_bias)
				281	? this->_bias + (current.multi() * this->_bias_multi_stride)
				282	: nullptr;
				283
				284	auto act = last_pass ? _act : Activation();
				285
				286	strat.transforms.Merge(
				287	c_panel_out,
				288	c_panel,
				289	this->_ldc,
				290	y,
				291	ymax,
				292	current.x0(),
				293	current.xmax(),
				294	bias,
				295	act,
				296	!first_pass); //Append
				297	}
				298	}
				299	}
				300	}
				301	public:
				302	GemmInterleaved2d(GemmInterleaved2d &) = delete;
				303	GemmInterleaved2d & operator= (GemmInterleaved2d &) = delete;
				304
				305	/* Constructor */
				306	/* Constructor */
				307	GemmInterleaved2d(const GemmArgs &args)
				308	: _ci(args._ci)
				309	, _Msize(args._Msize)
				310	, _Nsize(args._Nsize)
				311	, _Ksize(args._Ksize)
				312	, _nbatches(args._nbatches)
				313	, _nmulti(args._nmulti)
				314	, _trA(args._trA)
				315	, _trB(args._trB)
				316	, _act(args._act)
				317	, _maxthreads(args._maxthreads)
				318	, _nthreads(args._maxthreads)
				319
				320	// Work out the rounded size of M - needed for some buffers.
				321	, _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
				322	, _Mround ( _Mround_div * strategy::out_height() )
				323
				324	, _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
				325	, _Nround ( _Nround_div * strategy::out_width() )
				326	{
				327	const unsigned int L1_size = _ci->get_L1_cache_size();
				328	const unsigned int L2_size = _ci->get_L2_cache_size();
				329
				330	assert(_maxthreads > 0);
				331
				332	// Work out blocking parameters, or override from provided GemmConfig
				333	if (args._cfg && args._cfg->inner_block_size) {
				334	_k_block = args._cfg->inner_block_size;
				335	} else {
				336	// k_block: Find out how much of the larger array can be loaded into half the cache.
				337	// This should account for associative caches.
				338	_k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
				339
				340	// Needs to be (at least a single) multiple of the K unroll level.
				341	_k_block /= strategy::k_unroll();
				342	_k_block = std::max(_k_block, 1U) * strategy::k_unroll();
				343
				344	// Now tune to presented problem size; this is how many blocks we need.
				345	unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
				346
				347	// So divide the space equally into that many blocks.
				348	_k_block = iceildiv(_Ksize, num_k_blocks);
				349
				350	// And round UP to the K unroll level required.
				351	_k_block = iceildiv(_k_block, strategy::k_unroll());
				352	_k_block *= strategy::k_unroll();
				353	}
				354
				355	if (args._cfg && args._cfg->outer_block_size) {
				356	_x_block = args._cfg->outer_block_size;
				357	} else {
				358	// x_block: Work out how many rows (of length k_block) will fit in the L2
				359	// Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
				360	_x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
				361	(sizeof(Toi) * _k_block);
				362
				363	// Needs to be (at least a single) multiple of the kernel output width.
				364	_x_block /= strategy::out_width();
				365	_x_block = std::max(_x_block, 1U) * strategy::out_width();
				366
				367	// And tune to the presented problem size.
				368	unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
				369	_x_block = iceildiv(_Nsize, num_x_blocks);
				370
				371	_x_block = iceildiv(_x_block, strategy::out_width());
				372	_x_block *= strategy::out_width();
				373	}
				374
				375	// Work out the rounded size of M - needed for some buffers.
				376	}
				377
				378	// Interface implementation - Compulsory functions
				379	ndrange_t get_window_size() const override {
				380	unsigned m = (_Mround / strategy::out_height()) * _nbatches;
				381	unsigned n = _Nround_div;
				382
Georgios Pinitas	5aa1a0b	2020-07-02 20:02:20 +0100	[diff] [blame]	383	return { m, n };
Joseph Dobson	6f8b17d	2020-02-11 19:32:11 +0000	[diff] [blame]	384	}
				385
				386	// set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
				387	void set_nthreads(int nthreads) override {
				388	_nthreads = std::min(nthreads, _maxthreads);
				389	}
				390
				391	void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
				392	/*
				393	* This particular GEMM implementation can only be broken up over the M & N
				394	* dimensions, we inform the frame work of this limitation via the get_window_size function
				395	*/
Joseph Dobson	6f8b17d	2020-02-11 19:32:11 +0000	[diff] [blame]	396	const auto m_start = work_range.get_position(0);
				397	const auto n_start = work_range.get_position(1);
				398	const auto m_size = work_range.get_size(0);
				399	const auto n_size = work_range.get_size(1);
				400	const auto m_end = m_start + m_size;
				401	const auto n_end = n_start + n_size;
				402
				403	const auto m_threadid = thread_locator.get_position(0);
				404	const auto n_threadid = thread_locator.get_position(1);
				405
				406	execute_transpose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
				407	}
				408
				409	std::size_t get_working_size()const override {
				410	/*
				411	* Because we do not know how schedular will break up
				412	* the task, we need to ensure that alloc enough
				413	* space to be able to handle the case where every thread
				414	* is parallelised across B AND also every thrread is parallelised across A
				415	*
				416	* If we parallelise across A, then we only need one buffer of A and 64 buffers of B
				417	* If we parallelise across B, then we only need 64 buffer of B and
				418	*/
				419	return get_c_working_size() * _maxthreads
				420	+ get_a_working_size() * _maxthreads
				421	+ get_b_working_size() * _maxthreads
				422	+ 64; //to account for cacheline alignment
				423	}
				424
				425
				426	void set_working_space(void *working_space) override {
				427	// Make sure everything ends up cache line aligned
				428	int8_t working_space_bytes = reinterpret_cast<int8_t >(working_space);
				429	intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
				430
				431	size_t diff=0;
				432
				433	if (working_space_int & 0x3F) {
				434	diff = 0x40 - (working_space_int & 0x3F);
				435	}
				436
				437	working_space_bytes += diff;
				438
				439	_working_space = reinterpret_cast<void *>(working_space_bytes);
				440	}
				441
				442	~GemmInterleaved2d() override { }
				443	};
				444
				445	} // namespace arm_gemm