Anthony Barbier | 3d677cc | 2018-07-23 16:42:59 +0100 | [diff] [blame] | 1 | /* |
Michele Di Giorgio | 6ad60af | 2020-06-09 14:52:15 +0100 | [diff] [blame^] | 2 | * Copyright (c) 2018-2020 ARM Limited. |
Anthony Barbier | 3d677cc | 2018-07-23 16:42:59 +0100 | [diff] [blame] | 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
Michalis Spyrou | f464337 | 2019-11-29 16:17:13 +0000 | [diff] [blame] | 24 | #ifndef ARM_COMPUTE_ASSEMBLY_HELPERS_H |
| 25 | #define ARM_COMPUTE_ASSEMBLY_HELPERS_H |
Anthony Barbier | 3d677cc | 2018-07-23 16:42:59 +0100 | [diff] [blame] | 26 | |
| 27 | #include "arm_compute/core/CPP/CPPTypes.h" |
| 28 | #include "arm_compute/core/Utils.h" |
| 29 | |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 30 | #include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" |
Michele Di Giorgio | 6ad60af | 2020-06-09 14:52:15 +0100 | [diff] [blame^] | 31 | #include "arm_gemm.hpp" |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 32 | |
Anthony Barbier | 3d677cc | 2018-07-23 16:42:59 +0100 | [diff] [blame] | 33 | namespace arm_compute |
| 34 | { |
/** Blocking parameters used to split a GEMM along its M, N and K dimensions */
struct BlockSizes
{
    /** Block size along the K dimension */
    unsigned int k_block = 0U;
    /** Block size along the N (x) dimension */
    unsigned int x_block = 0U;
    /** Block size along the M dimension (must be a multiple of strategy_out_height) */
    unsigned int m_round = 0U;
    /** Number of rows (M) processed by the selected strategy */
    unsigned int strategy_out_height = 0U;
};
| 43 | |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 44 | /** Extracts the kernel description of the selected kernel by the GEMM backend heuristics |
Anthony Barbier | ac314c2 | 2018-09-11 17:49:10 +0100 | [diff] [blame] | 45 | * |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 46 | * @param[in] input_type Data type of the input tensor. |
| 47 | * @param[in] ci CPU information. |
| 48 | * @param[in] num_threads Maximum number of threads that might be used for the calculations. |
| 49 | * @param[in] p M, N, K sizes. |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 50 | * @param[in] activation Activation struct |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 51 | * @param[in] pretranspose_hint Is B also pretransposed ? |
Anthony Barbier | ac314c2 | 2018-09-11 17:49:10 +0100 | [diff] [blame] | 52 | * |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 53 | * @return Kernel description that the assembly heuristics picked for the given configuration |
Anthony Barbier | ac314c2 | 2018-09-11 17:49:10 +0100 | [diff] [blame] | 54 | */ |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 55 | arm_gemm::KernelDescription get_gemm_info(DataType input_type, |
| 56 | const CPUInfo &ci, |
| 57 | const unsigned int num_threads, |
| 58 | const INEGEMMWrapperKernel::Params &p, |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 59 | arm_gemm::Activation activation, |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 60 | bool pretranspose_hint); |
Anthony Barbier | ac314c2 | 2018-09-11 17:49:10 +0100 | [diff] [blame] | 61 | |
Anthony Barbier | 3d677cc | 2018-07-23 16:42:59 +0100 | [diff] [blame] | 62 | /** Calculate the recommended block sizes to use based on the CPU cache sizes and the strategy which will be used |
| 63 | * |
Georgios Pinitas | 7cd26d4 | 2019-01-09 18:35:17 +0000 | [diff] [blame] | 64 | * @param[in] ci CPU information. |
Anthony Barbier | 3d677cc | 2018-07-23 16:42:59 +0100 | [diff] [blame] | 65 | * @param[in] M M dimension. |
| 66 | * @param[in] N N dimension. |
| 67 | * @param[in] K K dimension. |
| 68 | * |
| 69 | * @return Recommeded block sizes to use for the given M, N, K dimensions. |
| 70 | */ |
| 71 | template <typename strategy> |
| 72 | BlockSizes calculate_block_sizes(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K) |
| 73 | { |
| 74 | BlockSizes bs; |
| 75 | |
| 76 | using Toi = typename strategy::operand_type; |
| 77 | |
| 78 | const unsigned int L1_size = ci.get_L1_cache_size(); |
| 79 | const unsigned int L2_size = ci.get_L2_cache_size(); |
| 80 | |
| 81 | // Work out blocking parameters |
| 82 | |
| 83 | // k_block: Find out how much of the larger array can be loaded into half the cache. |
| 84 | // This should account for associative caches. |
| 85 | bs.k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); |
| 86 | |
| 87 | // Needs to be (at least a single) multiple of the K unroll level. |
| 88 | bs.k_block /= strategy::k_unroll(); |
| 89 | bs.k_block = std::max(bs.k_block, 1U) * strategy::k_unroll(); |
| 90 | |
| 91 | // Now tune to presented problem size; this is how many blocks we need. |
| 92 | int num_k_blocks = DIV_CEIL(K, bs.k_block); |
| 93 | |
| 94 | // So divide the space equally into that many blocks. |
| 95 | bs.k_block = DIV_CEIL(K, num_k_blocks); |
| 96 | |
| 97 | // And round UP to the K unroll level required. |
| 98 | bs.k_block = ceil_to_multiple(bs.k_block, strategy::k_unroll()); |
| 99 | |
| 100 | // x_block: Work out how many rows (of length k_block) will fit in the L2 |
| 101 | // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. |
| 102 | bs.x_block = (((L2_size * 9) / 10) - (bs.k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / (sizeof(Toi) * bs.k_block); |
| 103 | |
| 104 | // Needs to be (at least a single) multiple of the kernel output width. |
| 105 | bs.x_block /= strategy::out_width(); |
| 106 | bs.x_block = std::max(bs.x_block, 1U) * strategy::out_width(); |
| 107 | |
| 108 | // And tune to the presented problem size. |
| 109 | int num_x_blocks = DIV_CEIL(N, bs.x_block); |
| 110 | bs.x_block = DIV_CEIL(N, num_x_blocks); |
| 111 | |
| 112 | bs.x_block = ceil_to_multiple(bs.x_block, strategy::out_width()); |
| 113 | |
| 114 | // Work out the rounded size of M - needed for some buffers. |
| 115 | bs.m_round = ceil_to_multiple(M, strategy::out_height()); |
| 116 | bs.strategy_out_height = strategy::out_height(); |
| 117 | |
| 118 | return bs; |
| 119 | } |
| 120 | |
| 121 | } // namespace arm_compute |
Michalis Spyrou | f464337 | 2019-11-29 16:17:13 +0000 | [diff] [blame] | 122 | #endif /* ARM_COMPUTE_ASSEMBLY_HELPERS_H */ |