/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <alloca.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <tuple>
#include <type_traits>
#include <vector>

#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "convolver.hpp"
#include "ndrange.hpp"
#include "performance_parameters.hpp"
#include "transform.hpp"
#include "utils.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#ifndef UNUSED
#define __I_DEFINED_UNUSED
#define UNUSED(x) ((void)(x))
#endif

namespace arm_gemm {

namespace {

// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
// that.
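// Three specializations follow: <Nothing, false> for the non-quantized case,
// <Requantize32, false> for kernels which requantize internally, and
// <Requantize32, true> for kernels which need a separate quantize pass.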

template<typename OutputStage, bool SeparateQuantize = false>
class run_hybrid_kernel {
public:
    template<typename strategy, typename To, typename Tr>
    static void run (
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
        const OutputStage &os, const int32_t *col_bias, unsigned int n_0);
};

template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Nothing, false>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
    unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
    const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
     * a partial block and pad the bias for that block. */
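    /* Illustrative example (hypothetical sizes): with N=100 and
     * strategy::out_width()==12, the code below computes N_bulk=96 and
     * N_remainder=4; the main kernel call covers columns 0-95 and the tail
     * call covers the last 4 columns with a bias buffer padded to width 12. */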
    if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
        /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
        unsigned int N_remainder = N % strategy::out_width();
        unsigned int N_bulk = N - N_remainder;

        /* Output argument to be used for the tail */
        IndirectOutputArg<Tr> offset_output = output_arg;

        /* If there is a "bulk" to be processed, handle that and update "offset_output" appropriately. */
        if (N_bulk > 0) {
            strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, output_arg, bias_ptr, act, accumulate);

            if (output_arg.is_indirect) {
                offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
            } else {
                offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
            }
        }

        /* Pad the bias buffer for the remainder */
        Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
        memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));

        /* Process the remainder, offsetting the B pointer as needed. */
        strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder, b_ptr + (N_bulk * kern_k), offset_output, bias_pad_buffer, act, accumulate);
    } else {
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate);
    }
}

template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Requantize32, false>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
    unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
    const Requantize32 &os, const int32_t *col_bias, unsigned int n_0) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0);
}

template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Requantize32, true>::run(
#ifdef CYCLE_PROFILING
    profiler &prof,
#endif
    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
    unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
    const Requantize32 &os, const int32_t *col_bias, unsigned int n_0) {
    UNUSED(kern_k);
    // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
    assert(M <= strategy::out_height());
    // We don't yet support indirect output (as the quantizer can't do it).
    assert(output_arg.is_indirect == false);

    // We need a row sum buffer and intermediate output buffer.
    // These go on the stack as they are not too large, using an automatic array and alloca() respectively.
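    // Illustrative sizing (assumed figures): with out_height()==8,
    // out_width()==12 and N=100, output_width below rounds up to 108, so the
    // intermediate buffer holds 108 * 8 result values on the stack.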
    int32_t row_sums[strategy::out_height()];
    typename strategy::result_type *result_buffer;

    unsigned int output_width = roundup(N, strategy::out_width());

    result_buffer = reinterpret_cast<typename strategy::result_type *>(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type)));

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
        // Perform the GEMM, into the output buffer.
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg<typename strategy::result_type>(result_buffer, output_width), nullptr, Activation(), false);
    }

    if (os.b_offset != 0) {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k);
#endif
        row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os);
    } else {
        memset(row_sums, 0, sizeof(int32_t) * strategy::out_height());
    }

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N);
#endif
        // Quantize
        requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0);
    }
}

} // anonymous namespace

// Implementation of the GemmCommon abstract class.
template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    GemmArgs _args;
    OutputStage _os = {};

    /* Quantized support (in addition to 'output stage' above) */
    int32_t *_col_bias = nullptr;

    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    /* Blocking info */
    const unsigned int _k_block;
    const unsigned int _n_block;
    const unsigned int _Mround;

    /* Pretransposed buffer. */
    const Toi *_B_transposed = nullptr;

    /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver - only set up for convolution problems, so also doubles as a flag. */
    std::unique_ptr<convolver<To>> _convolver = nullptr;

    // Array of pointers to output rows
    // Tr * const * _output_ptrs;

    const NDRange<4> _window_range;

    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _args._Nsize * _args._nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }

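    // Total padded K size: each of the _Ksections sections rounds up to a
    // multiple of k_unroll(). Illustrative example (assumed figures): with
    // Ksize=18, k_unroll()==4 and Ksections=3 this returns 3 * 20 = 60, even
    // though only 54 real input values exist; the transforms pad the rest.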
    static unsigned int get_ktotal(const GemmArgs &args) {
        return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
    }

    static unsigned int compute_k_block(const GemmArgs &args) {
        // Some kernels don't support accumulate mode - these can't do K blocking at all.
        if (!strategy::supports_accumulate() || std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        if (args._cfg && args._cfg->inner_block_size) {
            return args._cfg->inner_block_size;
        }

        // Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
        // datatypes); but don't divide into blocks until we hit 1.5X this size.
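        // Illustrative example (assumed figures): for FP32, sizeof(To)==4, so
        // the target block size is 2048/4 = 512. If ktotal is 1600 (> 768,
        // i.e. above 1.5X the target) we aim for iceildiv(1600, 512) = 4
        // blocks of iceildiv(1600, 4) = 400, rounded up to k_unroll().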
        unsigned int target_block_size = 2048 / sizeof(To);
        auto ktotal = get_ktotal(args);

        if (ktotal > ((target_block_size*3)/2)) {
            unsigned int target_blocks = iceildiv(ktotal, target_block_size);

            unsigned int block_size = iceildiv(ktotal, target_blocks);

            block_size = roundup(block_size, strategy::k_unroll());

            return block_size;
        }

        return ktotal;
    }

    // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width.  Otherwise do a
    // single block.
    static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) {
        if (args._cfg && args._cfg->outer_block_size) {
            return args._cfg->outer_block_size;
        }

        if (args._Nsize <= 64) {
            return args._Nsize;
        }

        if ((args._Msize / args._Nsize) > 155) {
            return args._Nsize;
        }

        // "Asymmetric" quantizing GEMMs require a different approach - the tall skinny blocks we would otherwise
        // use imply a great deal of repeated work performing the row sums.  If row sums are involved, work out how
        // much "column" parallelism is going to be required and set the block size accordingly.
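        // Illustrative example (assumed figures): with one multi, one batch,
        // Msize=64 and out_height()==8 there are 8 units of row parallelism;
        // with maxthreads=16 we need iceildiv(16, 8) = 2 column blocks, so
        // n_block becomes iceildiv(Nsize, 2) rounded up to out_width().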
        if (std::is_same<OutputStage, Requantize32>::value) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);

            // Row sums only needed if b_offset isn't 0
            if (qp->b_offset != 0) {
                // We can already parallelize across batches, multis and rows (in units of 'out_height')
                int multi_row_parallelism = args._nmulti * args._nbatches * iceildiv(args._Msize, strategy::out_height());

                // If this isn't enough, we will need to split up the columns too.
                if (multi_row_parallelism < args._maxthreads) {
                    unsigned int columns_needed = iceildiv(args._maxthreads, multi_row_parallelism);

                    unsigned int n_block = iceildiv(args._Nsize, columns_needed);

                    return roundup(n_block, strategy::out_width());
                }

                // Multi/Batch/Row parallelism is enough - don't split up the columns.
                return args._Nsize;
            }
        }

        if (args._Ksize <= 128 && args._maxthreads <= 16) {
            return strategy::out_width() * 3;
        }

        return strategy::out_width();
    }

public:
    GemmHybridIndirect(GemmHybridIndirect &) = delete;
    GemmHybridIndirect & operator= (GemmHybridIndirect &) = delete;

    /* Constructor */
    GemmHybridIndirect(const GemmArgs &args, const OutputStage &os)
        : _args(args), _os(os), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
          _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)),
          _Mround(roundup(args._Msize, strategy::out_height())),
          _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                        iceildiv(args._Nsize, _n_block), args._nmulti)
    {
        // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
        // GemmConfig.  Clear out the pointer to avoid accidents.
        _args._cfg = nullptr;
    }

    /* Constructor without OutputStage */
    GemmHybridIndirect(const GemmArgs &args)
        : _args(args), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
          _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
          _Mround(roundup(args._Msize, strategy::out_height())),
          _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                        iceildiv(args._Nsize, _n_block), args._nmulti)
    {
        // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
        // GemmConfig.  Clear out the pointer to avoid accidents.
        _args._cfg = nullptr;
    }

    // Interface implementation - Compulsory functions
    ndrange_t get_window_size() const override {
        return { _window_range.total_size() };
    }

    // This kernel can always be dynamically scheduled.
    bool supports_dynamic_scheduling() const override {
        return true;
    }

    // Execute
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_args._ci);

        std::vector<const To *> in_row_ptrs;
        std::vector<const To * const *> in_row_strings;
        std::vector<unsigned int> string_lengths;

        // In convolution mode, we need input pointers.
        if (_convolver) {
            in_row_ptrs.resize(strategy::out_height() * _args._Ksections, nullptr);
            in_row_strings.resize(_args._Ksections, nullptr);

            for (unsigned int i=0; i<_args._Ksections; i++) {
                in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
            }
        }

        // In any indirect mode, we need the string lengths.
        if (_args._indirect_input) {
            string_lengths = std::vector<unsigned int>(_args._Ksections, 0);
        }

        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        static_assert(std::is_same<To, Toi>::value, "gemm_hybrid_indirect: Operand types must be the same.");
        // static_assert(std::is_same<Tr, Tri>::value, "gemm_hybrid_indirect: Result types must be the same.");

        /* For now, each work item implies all the K for a given output
         * pixel (so we don't need to synchronize access to the output
         * array).  So separate the loop over K blocks here. */
        for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
            unsigned int kmax = std::min(k0 + _k_block, _Ktotal);
            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

            const bool first_pass = (k0 == 0);
            const bool last_pass = (kmax == _Ktotal);

            unsigned int first_section = (k0 / _rounded_Ksize);
            unsigned int first_offset = (k0 % _rounded_Ksize);
            unsigned int kleft = kern_k;
            unsigned int sections = 0;
            unsigned int offset = first_offset;

            if (_args._indirect_input) {
                while (kleft) {
                    // When chopping into sections: the amount that goes into 'string_lengths' is the amount to be
                    // processed (excluding padding).  But the amount we subtract from 'kleft' takes account of any
                    // padding applied.
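                    // Illustrative example (assumed figures): with Ksize=18,
                    // _rounded_Ksize=20, k0=16 and kern_k=8, the first string
                    // is min(8, 18-16) = 2 real values, kleft drops by
                    // min(8, 20-16) = 4 (real data plus padding), and the
                    // second string takes 4 values from the next section.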
                    string_lengths[sections] = std::min(kleft, _args._Ksize - offset);
                    kleft -= std::min(kleft, _rounded_Ksize - offset);
                    sections++;
                    offset = 0;
                }
            }

            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

            if (p.done()) {
                return;
            }

            // Process rows either 'out_height' rows at a time, or do all valid rows at once with a single kernel call.
            // The separate quantizer path only handles one block of rows at a time (as it has to store sums and intermediate results).
            // The convolution path only generates the pointers for one block of rows at a time.
            const bool process_all_rows = (!SeparateQuantize && !_convolver);

            do {
                const unsigned int m_start = p.dim(0) * strategy::out_height();
                const unsigned int m_end = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize);
                // const unsigned int m_end = std::min(m_start + strategy::out_height(), _args._Msize);
                const unsigned int batch = p.dim(1);
                const unsigned int n0 = p.dim(2) * _n_block;
                const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
                const unsigned int multi = p.dim(3);

                const Toi *b_panel = _B_transposed +
                                     (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
                                     (k0 * roundup(_args._Nsize, strategy::out_width())) +
                                     (n0 * kern_k);
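                // (The pretransposed B buffer is laid out multi-by-multi;
                // within a multi, K blocks are stored in order, and within a
                // K block each group of out_width() columns is stored
                // contiguously as kern_k rows - hence the three offset terms
                // above.)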

                IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);

#ifdef CYCLE_PROFILING
                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
                if (_indirect_buf) {
                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, sections, string_lengths.data(),
                        IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                } else if (_convolver) {
                    auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);

                    unsigned int pos=0;
                    auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);

                    while (!conv_rows.finished()) {
                        unsigned int width, conv_offset;

                        assert(pos < sections);

                        std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));

                        if (pos==0) {
                            assert(conv_offset == first_offset);
                        }
                        assert(width == string_lengths[pos]);
                        pos++;
                    }
                    assert(pos == sections);

                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, sections, string_lengths.data(),
                        IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                } else {
                    // Length to process.  This needs to exclude padding, but 'kmax' potentially includes it.
                    const unsigned int len = (std::min(_args._Ksize, kmax) - k0);

                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, 1, &len,
                        IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                }
            } while (process_all_rows ? p.next_dim1() : p.next_dim0());
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return true;
    }

    bool B_pretranspose_required() const override {
        return (_B_transposed==nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        // Start with actual pretransposed buffer...
        size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);

        // Space for result row pointers (not strictly needed any more but retained for indirect output testing)
        size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);

        if (std::is_same<OutputStage, Requantize32>::value) {
            size += get_col_sum_size();
        }

        return size;
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            _col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_args._nmulti; i++) {
                // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size.
                compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
            }
        }

        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer;

        strategy strat(_args._ci);

        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
            for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                const unsigned int kmax = std::min(k0 + _k_block, _Ktotal);

                /* Figure out the size of each block. */
                unsigned int k_size = kmax - k0;

                // We need to insert padding at the end of each K section.
                // The computation needed is a little delicate - the coordinates from the block walker are expressed in
                // terms of the full, padded, _Ktotal.
                // But we need to transform each section with reference to the original, unpadded, input, letting the
                // transform pad each section as needed.
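                // Illustrative example (assumed figures): with Ksize=18 and
                // k_unroll()==4, rounded_section_size is 20. A block position
                // kpos=16 is 16 values into section 0, so PrepareB reads real
                // rows 16..17 of that section and pads them up to k_unroll
                // (4 rows), advancing kpos to 20, the start of section 1.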

                // This is needed for computations below.
                const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());

                // The expected output format is an entire block of <out_width> columns interleaved, then the next
                // set of columns, and so on.  This means, as we are breaking it up vertically, we have to do it one
                // column block at a time.
                for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width()) {
                    unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);

                    // Track where we are and how much work is left.
                    unsigned int kpos = k0;
                    unsigned int kleft = k_size;

                    while (kleft) {
                        // Which section are we in?  Based on the rounded-up section size.
                        unsigned int k_section_base = kpos / rounded_section_size;
                        // How far into the section are we?
                        unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                        // We will copy either the rest of this section, or up to the end of the requested length.
                        unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);

                        strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                                  x0, xmax,
                                                  (k_section_base * _args._Ksize) + k_offset,             // K starting point - compute row to read based on our section and the true section length.
                                                  (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.

                        // We need to modify our position based on the ROUNDED version of what we just did.
                        unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                        buffer += strategy::out_width() * padded_length;

                        kpos += padded_length;
                        kleft -= padded_length;
                    }
                }
            }
        }
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }

    // Estimate cycles for given problem given provided parameters
    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params, const OutputStage &os = {}) {
        // Note: Current hybrid kernels don't actually round up height (they
        // have paths for each possible height).  Might need to make this
        // configurable in future.
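        // Illustrative example (assumed figures): a single 64x64x64 GEMM with
        // out_width()==12 rounds N up to 72, giving 64 * 72 * 64 = 294912
        // MACs; at, say, 16 kernel MACs per cycle that is ~18432 cycles
        // before the narrow-width penalty below.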
        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);

        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;

        // TODO: A bit of a kludge here: current hybrid kernels incur extra
        // overhead where the width is not a multiple of kernel width.  It's
        // most noticeable where the overall width is quite low, so add a 15%
        // penalty for such widths.
        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
            mac_cycles *= 1.15f;
        }

        uint64_t total_cycles = mac_cycles;

        // Quantizing kernels with separate quantize need to add in the extra stages.
        if (std::is_same<OutputStage, Requantize32>::value && SeparateQuantize) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);

            // Row sums: need to consider each value in A (batch * multi * M * K)...
            uint64_t rowsum_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * get_ktotal(args);

            // ... but row sums are skipped if B offset==0.
            if (qp->b_offset == 0) {
                rowsum_bytes = 0;
            }

            // Use "prepare bytes per cycle" to store "row sum values per cycle".
            float rowsum_cycles = static_cast<float>(rowsum_bytes) / params.prepare_bytes_cycle;

            // Requantize: need to consider each value in C (batch * multi * M * N)
            uint64_t requantize_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * args._Nsize;

            // Use "merge bytes per cycle" to store "requantize values per cycle".
            float requantize_cycles = static_cast<float>(requantize_bytes) / params.merge_bytes_cycle;

            // Recalculate total_cycles with the extra components.
            total_cycles = mac_cycles + rowsum_cycles + requantize_cycles;
        }

        return total_cycles;
    }

    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);

            qp->bias = bias;
            qp->bias_multi_stride = bias_multi_stride;
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _args._Ksize);
        _indirect_buf = ptr;
    }

    void set_convolution_parameters(ConvolutionParameters parms) override {
        assert(parms.input_channels == _args._Ksize);
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }
};
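
// Usage sketch (illustrative only - strategy selection, argument setup and
// buffer management are normally handled by the arm_gemm front end; the names
// and elided arguments below are assumptions, not part of this header):
//
//   GemmArgs args = ...;  // problem shape, batches, multis, CPU info, etc.
//   GemmHybridIndirect<SomeHybridStrategy, float, float> gemm(args);
//
//   std::vector<uint8_t> b_buf(gemm.get_B_pretransposed_array_size());
//   gemm.pretranspose_B_array(b_buf.data(), B, ldb, B_multi_stride);
//
//   // Set the A/C arrays via the GemmCommon interface, then run the whole
//   // window (optionally split across threads, since this implementation
//   // supports dynamic scheduling):
//   gemm.execute(full_range_of(gemm.get_window_size()), {}, 0);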

} // namespace arm_gemm

#ifdef __I_DEFINED_UNUSED
#undef UNUSED
#endif