Blame - arm_compute/core/NEON/kernels/assembly/Helpers.h - ml/ComputeLibrary

blob: e2a46e96a3ebcea4c94f56dc56bdaaf005d20dc2 [file] [log] [blame]

Anthony Barbier	3d677cc	2018-07-23 16:42:59 +0100	[diff] [blame]	1	/*
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	2	* Copyright (c) 2018-2019 ARM Limited.
Anthony Barbier	3d677cc	2018-07-23 16:42:59 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#ifndef __ARM_COMPUTE_ASSEMBLY_HELPERS_H__
				25	#define __ARM_COMPUTE_ASSEMBLY_HELPERS_H__
				26
				27	#include "arm_compute/core/CPP/CPPTypes.h"
				28	#include "arm_compute/core/Utils.h"
				29
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	30	#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
				31	#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
				32
Anthony Barbier	3d677cc	2018-07-23 16:42:59 +0100	[diff] [blame]	33	namespace arm_compute
				34	{
				35	/** Block sizes to use to break the M, N, K dimension */
				36	struct BlockSizes
				37	{
				38	unsigned int k_block{ 0 }; /*< Block size alon the K dimension /
				39	unsigned int x_block{ 0 }; /*< Block size along the N (x) dimension /
				40	unsigned int m_round{ 0 }; /*< Block size along the M dimension (Must be a multiple of strategy_out_height) /
				41	unsigned int strategy_out_height{ 0 }; /*< Number of rows (M) processed by the selected strategy /
				42	};
				43
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	44	/** Extracts the kernel description of the selected kernel by the GEMM backend heuristics
Anthony Barbier	ac314c2	2018-09-11 17:49:10 +0100	[diff] [blame]	45	*
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	46	* @param[in] input_type Data type of the input tensor.
				47	* @param[in] ci CPU information.
				48	* @param[in] num_threads Maximum number of threads that might be used for the calculations.
				49	* @param[in] p M, N, K sizes.
				50	* @param[in] alpha Alpha value.
				51	* @param[in] beta Beta value.
				52	* @param[in] pretranspose_hint Is B also pretransposed ?
Anthony Barbier	ac314c2	2018-09-11 17:49:10 +0100	[diff] [blame]	53	*
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	54	* @return Kernel description that the assembly heuristics picked for the given configuration
Anthony Barbier	ac314c2	2018-09-11 17:49:10 +0100	[diff] [blame]	55	*/
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	56	arm_gemm::KernelDescription get_gemm_info(DataType input_type,
				57	const CPUInfo &ci,
				58	const unsigned int num_threads,
				59	const INEGEMMWrapperKernel::Params &p,
				60	float alpha,
				61	float beta,
				62	bool pretranspose_hint);
Anthony Barbier	ac314c2	2018-09-11 17:49:10 +0100	[diff] [blame]	63
Anthony Barbier	3d677cc	2018-07-23 16:42:59 +0100	[diff] [blame]	64	/** Calculate the recommended block sizes to use based on the CPU cache sizes and the strategy which will be used
				65	*
Georgios Pinitas	7cd26d4	2019-01-09 18:35:17 +0000	[diff] [blame]	66	* @param[in] ci CPU information.
Anthony Barbier	3d677cc	2018-07-23 16:42:59 +0100	[diff] [blame]	67	* @param[in] M M dimension.
				68	* @param[in] N N dimension.
				69	* @param[in] K K dimension.
				70	*
				71	* @return Recommeded block sizes to use for the given M, N, K dimensions.
				72	*/
				73	template <typename strategy>
				74	BlockSizes calculate_block_sizes(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
				75	{
				76	BlockSizes bs;
				77
				78	using Toi = typename strategy::operand_type;
				79
				80	const unsigned int L1_size = ci.get_L1_cache_size();
				81	const unsigned int L2_size = ci.get_L2_cache_size();
				82
				83	// Work out blocking parameters
				84
				85	// k_block: Find out how much of the larger array can be loaded into half the cache.
				86	// This should account for associative caches.
				87	bs.k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
				88
				89	// Needs to be (at least a single) multiple of the K unroll level.
				90	bs.k_block /= strategy::k_unroll();
				91	bs.k_block = std::max(bs.k_block, 1U) * strategy::k_unroll();
				92
				93	// Now tune to presented problem size; this is how many blocks we need.
				94	int num_k_blocks = DIV_CEIL(K, bs.k_block);
				95
				96	// So divide the space equally into that many blocks.
				97	bs.k_block = DIV_CEIL(K, num_k_blocks);
				98
				99	// And round UP to the K unroll level required.
				100	bs.k_block = ceil_to_multiple(bs.k_block, strategy::k_unroll());
				101
				102	// x_block: Work out how many rows (of length k_block) will fit in the L2
				103	// Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
				104	bs.x_block = (((L2_size * 9) / 10) - (bs.k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / (sizeof(Toi) * bs.k_block);
				105
				106	// Needs to be (at least a single) multiple of the kernel output width.
				107	bs.x_block /= strategy::out_width();
				108	bs.x_block = std::max(bs.x_block, 1U) * strategy::out_width();
				109
				110	// And tune to the presented problem size.
				111	int num_x_blocks = DIV_CEIL(N, bs.x_block);
				112	bs.x_block = DIV_CEIL(N, num_x_blocks);
				113
				114	bs.x_block = ceil_to_multiple(bs.x_block, strategy::out_width());
				115
				116	// Work out the rounded size of M - needed for some buffers.
				117	bs.m_round = ceil_to_multiple(M, strategy::out_height());
				118	bs.strategy_out_height = strategy::out_height();
				119
				120	return bs;
				121	}
				122
				123	} // namespace arm_compute
				124	#endif /* __ARM_COMPUTE_ASSEMBLY_HELPERS_H__ */