blob: 0c1d3a387b16ce56c75bddc7c9aa5a85c11a4382 [file] [log] [blame]
/*
 * Copyright (c) 2017-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_gemm.hpp"
25#include "gemm_common.hpp"
Georgios Pinitas7cd26d42019-01-09 18:35:17 +000026#include "gemm_hybrid.hpp"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000027#include "gemm_hybrid_indirect.hpp"
David Manselle39334c2018-07-06 17:53:35 +010028#include "gemm_implementation.hpp"
Pablo Telloeb82fd22018-02-23 13:43:50 +000029#include "gemm_interleaved.hpp"
David Mansellce8f6052018-05-17 18:51:26 +010030#include "gemv_batched.hpp"
Pablo Telloeb82fd22018-02-23 13:43:50 +000031#include "gemv_pretransposed.hpp"
32
Anthony Barbier5f707732018-07-03 16:22:02 +010033#include "kernels/a32_sgemm_8x6.hpp"
Francesco Petrogalli553f6952022-06-30 10:22:01 +000034#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +000035#include "kernels/a64_ffhybrid_fp32_mla_6x16.hpp"
36#include "kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp"
Milos Puzovic905786e2024-03-26 14:34:30 +000037#include "kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp"
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +000038#include "kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp"
39#include "kernels/a64_ffinterleaved_fp32_mla_8x12.hpp"
Francesco Petrogalli553f6952022-06-30 10:22:01 +000040#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010041#include "kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp"
42#include "kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp"
43#include "kernels/a64_hybrid_fp32_mla_4x24.hpp"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000044#include "kernels/a64_hybrid_fp32_mla_6x16.hpp"
45#include "kernels/a64_hybrid_fp32_mla_8x4.hpp"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010046#include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000047#include "kernels/a64_sgemm_8x12.hpp"
Michalis Spyrou778b95c2021-04-20 12:15:52 +010048#include "kernels/a64_sgemm_8x6.hpp"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000049#include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp"
50#include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp"
Pablo Telloeb82fd22018-02-23 13:43:50 +000051
Viet-Hoa Do03b29712022-06-01 11:47:14 +010052#ifdef ARM_COMPUTE_ENABLE_SVE
Francesco Petrogalli553f6952022-06-30 10:22:01 +000053#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +000054#include "kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp"
55#include "kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp"
56#include "kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp"
57#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
Francesco Petrogalli553f6952022-06-30 10:22:01 +000058#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
Viet-Hoa Do03b29712022-06-01 11:47:14 +010059#ifdef ARM_COMPUTE_ENABLE_SME2
60#include "kernels/sme2_gemv_fp32_mla_16VL.hpp"
61#include "kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp"
62#include "kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp"
63#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp"
64#include "kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp"
65#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp"
66#include "kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp"
67#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp"
68#endif // ARM_COMPUTE_ENABLE_SME2
69
Michael Tyler74921ee2023-04-12 17:43:17 +010070#include "kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp"
71#include "kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp"
72#include "kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp"
73#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010074#include "kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp"
75#include "kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000076#include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp"
77#include "kernels/sve_hybrid_fp32_mla_8x1VL.hpp"
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010078#include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000079#include "kernels/sve_interleaved_fp32_mla_8x3VL.hpp"
80#include "kernels/sve_interleaved_fp32_mmla_8x3VL.hpp"
Viet-Hoa Do03b29712022-06-01 11:47:14 +010081#endif // ARM_COMPUTE_ENABLE_SVE
Georgios Pinitas421405b2018-10-26 19:05:32 +010082
Anthony Barbier5f707732018-07-03 16:22:02 +010083namespace arm_gemm {
84
Georgios Pinitas7cd26d42019-01-09 18:35:17 +000085static const GemmImplementation<float, float> gemm_fp32_methods[] =
86{
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000087// GEMV cases - starting with 'gemv_batched' wrapper to turn batched GEMV into GEMM.
Georgios Pinitas7cd26d42019-01-09 18:35:17 +000088{
89 GemmMethod::GEMV_BATCHED,
90 "gemv_batched",
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000091 [](const GemmArgs &args) { return args._Msize==1 && args._nbatches>1 && !args._indirect_input; },
Georgios Pinitas7cd26d42019-01-09 18:35:17 +000092 nullptr,
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010093 [](const GemmArgs &args) { return new GemvBatched<float, float>(args); }
Georgios Pinitas7cd26d42019-01-09 18:35:17 +000094},
95#ifdef __aarch64__
Georgios Pinitas4ee8b152021-07-16 16:16:43 +010096#ifdef ARM_COMPUTE_ENABLE_BF16
97// "fast mode" (BF16) kernels
98GemmImplementation<float, float>::with_estimate(
99 GemmMethod::GEMM_INTERLEAVED,
100 "a64_interleaved_bf16fp32_mmla_8x12",
101 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
102 [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>::estimate_cycles<float>(args); },
103 [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>(args); }
104),
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +0000105
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100106GemmImplementation<float, float>::with_estimate(
107 GemmMethod::GEMM_HYBRID,
108 "a64_hybrid_fp32bf16fp32_mmla_6x16",
109 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
110 [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>::estimate_cycles<float>(args); },
111 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>(args); }
112),
113GemmImplementation<float, float>::with_estimate(
114 GemmMethod::GEMM_HYBRID,
115 "a64_hybrid_fp32bf16fp32_mmla_4x24",
116 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
117 [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>::estimate_cycles<float>(args); },
118 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>(args); }
119),
120#endif // ARM_COMPUTE_ENABLE_BF16
Michalis Spyrou20fca522021-06-07 14:23:57 +0100121#ifdef ARM_COMPUTE_ENABLE_SVE
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100122#ifdef ARM_COMPUTE_ENABLE_SME2
123// SME kernels
124{
125 GemmMethod::GEMM_HYBRID,
126 "sme2_gemv_fp32bf16fp32_dot_16VL",
Gunes Bayir499b5bc2024-04-26 13:15:05 +0100127 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_sme2() && args._Msize==1 && args._nbatches==1 && !args._indirect_input && !args._accumulate; },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100128 nullptr,
129 [](const GemmArgs &args) { return new GemvPretransposed<cls_sme2_gemv_fp32bf16fp32_dot_16VL, float, float>(args); }
130},
131{
132 GemmMethod::GEMM_HYBRID,
133 "sme2_gemv_fp32_mla_16VL",
Gunes Bayir499b5bc2024-04-26 13:15:05 +0100134 [](const GemmArgs &args) { return args._ci->has_sme2() && args._Msize==1 && args._nbatches==1 && !args._indirect_input && !args._accumulate; },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100135 nullptr,
136 [](const GemmArgs &args) { return new GemvPretransposed<cls_sme2_gemv_fp32_mla_16VL, float, float>(args); }
137},
138#ifdef ARM_COMPUTE_ENABLE_BF16
139{
140 GemmMethod::GEMM_INTERLEAVED,
141 "sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL",
Gunes Bayir499b5bc2024-04-26 13:15:05 +0100142 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_sme2() && !args._accumulate; },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100143 [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
David Mansell5c767422024-03-15 16:35:13 +0000144 return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100145 [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL, float, float>(args); }
146},
147#endif // ARM_COMPUTE_ENABLE_BF16
148{
149 GemmMethod::GEMM_INTERLEAVED,
150 "sme2_interleaved_nomerge_fp32_mopa_1VLx4VL",
Gunes Bayir499b5bc2024-04-26 13:15:05 +0100151 [](const GemmArgs &args) { return args._ci->has_sme2() && !args._accumulate; },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100152 [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
David Mansell5c767422024-03-15 16:35:13 +0000153 return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100154 [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL, float, float>(args); }
155},
156#ifdef ARM_COMPUTE_ENABLE_BF16
157{
158 GemmMethod::GEMM_INTERLEAVED,
159 "sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL",
Gunes Bayir499b5bc2024-04-26 13:15:05 +0100160 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_sme2() && !args._accumulate; },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100161 [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
162 return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
163 [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL, float, float>(args); }
164},
165#endif // ARM_COMPUTE_ENABLE_BF16
166{
167 GemmMethod::GEMM_INTERLEAVED,
168 "sme2_interleaved_nomerge_fp32_mopa_4VLx1VL",
Gunes Bayir499b5bc2024-04-26 13:15:05 +0100169 [](const GemmArgs &args) { return args._ci->has_sme2() && !args._accumulate; },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100170 [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
171 return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
172 [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL, float, float>(args); }
173},
174#ifdef ARM_COMPUTE_ENABLE_BF16
175{
176 GemmMethod::GEMM_INTERLEAVED,
177 "sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL",
Gunes Bayir499b5bc2024-04-26 13:15:05 +0100178 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_sme2() && !args._accumulate; },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100179 nullptr,
180 [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL, float, float>(args); }
181},
182#endif // ARM_COMPUTE_ENABLE_BF16
183{
184 GemmMethod::GEMM_INTERLEAVED,
185 "sme2_interleaved_nomerge_fp32_mopa_2VLx2VL",
Gunes Bayir499b5bc2024-04-26 13:15:05 +0100186 [](const GemmArgs &args) { return args._ci->has_sme2() && !args._accumulate; },
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100187 nullptr,
188 [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL, float, float>(args); }
189},
190#endif // ARM_COMPUTE_ENABLE_SME2
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100191#ifdef ARM_COMPUTE_ENABLE_BF16
192GemmImplementation<float, float>::with_estimate(
193 GemmMethod::GEMM_INTERLEAVED,
194 "sve_interleaved_bf16fp32_mmla_8x3VL",
195 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
196 [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>::estimate_cycles<float>(args); },
197 [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>(args); }
198),
199GemmImplementation<float, float>::with_estimate(
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100200 GemmMethod::GEMM_HYBRID,
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100201 "sve_hybrid_fp32bf16fp32_mmla_6x4VL",
David Mansellc22e1262024-05-03 13:24:48 +0100202 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100203 [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>::estimate_cycles<float>(args); },
204 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>(args); }
205),
206GemmImplementation<float, float>::with_estimate(
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000207 GemmMethod::GEMM_HYBRID,
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100208 "sve_hybrid_fp32bf16fp32_mmla_4x6VL",
David Mansellc22e1262024-05-03 13:24:48 +0100209 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100210 [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float>::estimate_cycles<float>(args); },
211 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float>(args); }
212),
213#endif // ARM_COMPUTE_ENABLE_BF16
214#ifdef ARM_COMPUTE_ENABLE_SVEF32MM
215// MMLA next due to higher throughput (which is SVE only)
216// Prefer this in all cases, except if fast mode is requested and BF16 is available.
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100217{
218 GemmMethod::GEMM_INTERLEAVED,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000219 "sve_interleaved_fp32_mmla_8x3VL",
Michalis Spyrou20fca522021-06-07 14:23:57 +0100220 [](const GemmArgs &args) { return args._ci->has_svef32mm() && (args._Ksize>4); },
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100221 [](const GemmArgs &args) { return !(args._fast_mode && args._ci->has_bf16()); },
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000222 [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float>(args); }
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100223},
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100224#endif // ARM_COMPUTE_ENABLE_SVEF32MM
225// SVE kernels
Georgios Pinitasc7b183a2020-03-06 18:12:09 +0000226{
227 GemmMethod::GEMM_HYBRID,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000228 "sve_hybrid_fp32_mla_8x1VL",
Pablo Marquez Telloa50f1932021-03-08 17:27:05 +0000229 [](const GemmArgs &args) { return args._ci->has_sve(); },
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100230 [](const GemmArgs &args) { return (args._Nsize < 12); },
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000231 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_8x1VL, float, float>(args); }
232},
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100233GemmImplementation<float, float>::with_estimate(
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000234 GemmMethod::GEMM_HYBRID,
235 "sve_hybrid_fp32_mla_6x4VL",
Pablo Marquez Telloa50f1932021-03-08 17:27:05 +0000236 [](const GemmArgs &args) { return args._ci->has_sve(); },
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100237 [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>::estimate_cycles<float>(args); },
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000238 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>(args); }
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100239),
240GemmImplementation<float, float>::with_estimate(
241 GemmMethod::GEMM_INTERLEAVED,
242 "sve_interleaved_fp32_mla_8x3VL",
243 [](const GemmArgs &args) { return args._ci->has_sve(); },
244 [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>::estimate_cycles<float>(args); },
245 [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); }
246),
Francesco Petrogalli553f6952022-06-30 10:22:01 +0000247 #ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +0000248#ifdef ARM_COMPUTE_ENABLE_BF16
249GemmImplementation<float, float>::with_estimate(
250 GemmMethod::GEMM_INTERLEAVED,
251 "sve_ffinterleaved_bf16fp32_mmla_8x3VL",
252 KernelWeightFormat::VL2VL_BL64_BF16,
253 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
254 [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, float, float>::estimate_cycles<float>(args); },
255 [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, float, float>(args); }
256),
257GemmImplementation<float, float>::with_estimate(
258 GemmMethod::GEMM_HYBRID,
259 "sve_ffhybrid_fp32bf16fp32_mmla_4x6VL",
260 KernelWeightFormat::VL2VL_BL64_BF16,
261 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
262 [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32bf16fp32_mmla_4x6VL, float, float>::estimate_cycles<float>(args); },
263 [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32bf16fp32_mmla_4x6VL, float, float>(args); }
264),
265#endif
266GemmImplementation<float, float>::with_estimate(
267 GemmMethod::GEMM_INTERLEAVED,
268 "sve_ffinterleaved_fp32_mla_8x3VL",
269 KernelWeightFormat::VL1VL_BL32,
270 [](const GemmArgs &args) { return args._ci->has_sve(); },
271 [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp32_mla_8x3VL, float, float>::estimate_cycles<float>(args); },
272 [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp32_mla_8x3VL, float, float>(args); }
273),
274GemmImplementation<float, float>::with_estimate(
275 GemmMethod::GEMM_HYBRID,
276 "sve_ffhybrid_fp32_mla_6x4VL",
277 KernelWeightFormat::VL1VL_BL32,
278 [](const GemmArgs &args) { return args._ci->has_sve(); },
279 [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32_mla_6x4VL, float, float>::estimate_cycles<float>(args); },
280 [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32_mla_6x4VL, float, float>(args); }
281),
Francesco Petrogalli553f6952022-06-30 10:22:01 +0000282#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
Michalis Spyrou20fca522021-06-07 14:23:57 +0100283#endif // ARM_COMPUTE_ENABLE_SVE
Michalis Spyrou778b95c2021-04-20 12:15:52 +0100284// Cortex-A35 specific kernel - use for any problem on A35, and never in any other cases.
285{
286 GemmMethod::GEMM_INTERLEAVED,
287 "a64_sgemm_8x6",
288 nullptr,
289 [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A35; },
290 [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x6, float, float>(args); }
291},
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000292// Arm® Neon™ hybrid methods
Georgios Pinitas7cd26d42019-01-09 18:35:17 +0000293{
294 GemmMethod::GEMM_HYBRID,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000295 "a64_smallK_hybrid_fp32_mla_8x4",
Radu Salavatf1f1f872024-02-27 18:32:26 +0000296 [](const GemmArgs &args) { return args._Ksize <= 8 && (args._Nsize % 4)==0 && !args._indirect_input && !args._accumulate; },
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100297 nullptr,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000298 [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_fp32_mla_8x4, float, float>(args); }
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100299},
300{
301 GemmMethod::GEMM_HYBRID,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000302 "a64_smallK_hybrid_fp32_mla_6x4",
Radu Salavatf1f1f872024-02-27 18:32:26 +0000303 [](const GemmArgs &args) { return (args._Ksize > 8 && args._Ksize <= 16) && (args._Nsize % 4)==0 && !args._indirect_input && !args._accumulate; },
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100304 nullptr,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000305 [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_fp32_mla_6x4, float, float>(args); }
Georgios Pinitas7cd26d42019-01-09 18:35:17 +0000306},
307{
Georgios Pinitas14613832019-03-01 19:07:11 +0000308 GemmMethod::GEMM_HYBRID,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000309 "a64_hybrid_fp32_mla_8x4",
310 nullptr,
Michalis Spyrou71ac9032019-11-14 14:31:44 +0000311 [](const GemmArgs &args) { return (args._Nsize < 12); },
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000312 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_8x4, float, float>(args); }
Michalis Spyrou71ac9032019-11-14 14:31:44 +0000313},
David Mansell318c9f42020-07-08 13:28:45 +0100314GemmImplementation<float, float>::with_estimate(
Michalis Spyrou71ac9032019-11-14 14:31:44 +0000315 GemmMethod::GEMM_HYBRID,
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100316 "a64_hybrid_fp32_mla_4x24",
317 nullptr,
318 [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float>::estimate_cycles<float>(args); },
319 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float>(args); }
320),
321GemmImplementation<float, float>::with_estimate(
322 GemmMethod::GEMM_HYBRID,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000323 "a64_hybrid_fp32_mla_6x16",
324 nullptr,
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100325 [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>::estimate_cycles<float>(args); },
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000326 [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>(args); }
David Mansell318c9f42020-07-08 13:28:45 +0100327),
David Mansell318c9f42020-07-08 13:28:45 +0100328GemmImplementation<float, float>::with_estimate(
Georgios Pinitas7cd26d42019-01-09 18:35:17 +0000329 GemmMethod::GEMM_INTERLEAVED,
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000330 "a64_sgemm_8x12",
Gian Marco Iodice463f9762020-05-19 14:12:27 +0100331 nullptr,
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100332 [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles<float>(args); },
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000333 [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, float, float>(args); }
David Mansell318c9f42020-07-08 13:28:45 +0100334),
Francesco Petrogalli553f6952022-06-30 10:22:01 +0000335#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +0000336#ifdef ARM_COMPUTE_ENABLE_BF16
337// "fast mode" (BF16) kernels
338GemmImplementation<float, float>::with_estimate(
339 GemmMethod::GEMM_INTERLEAVED,
340 "a64_ffinterleaved_bf16fp32_mmla_8x12",
341 KernelWeightFormat::VL256_BL64_BF16,
342 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
343 [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, float, float>::estimate_cycles<float>(args); },
344 [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, float, float>(args); }
345),
346GemmImplementation<float, float>::with_estimate(
347 GemmMethod::GEMM_HYBRID,
348 "a64_ffhybrid_fp32bf16fp32_mmla_4x24",
349 KernelWeightFormat::VL256_BL64_BF16,
350 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
351 [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_4x24, float, float>::estimate_cycles<float>(args); },
352 [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_4x24, float, float>(args); }
353),
Milos Puzovic905786e2024-03-26 14:34:30 +0000354GemmImplementation<float, float>::with_estimate(
355 GemmMethod::GEMM_HYBRID,
356 "a64_ffhybrid_fp32bf16fp32_mmla_6x16",
357 KernelWeightFormat::VL256_BL64_BF16,
358 [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
359 [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_6x16, float, float>::estimate_cycles<float>(args); },
360 [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_6x16, float, float>(args); }
361),
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +0000362#endif // BF16
363GemmImplementation<float, float>::with_estimate(
364 GemmMethod::GEMM_INTERLEAVED,
365 "a64_ffinterleaved_fp32_mla_8x12",
366 KernelWeightFormat::VL128_BL32,
367 nullptr,
368 [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp32_mla_8x12, float, float>::estimate_cycles<float>(args); },
369 [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp32_mla_8x12, float, float>(args); }
370),
371GemmImplementation<float, float>::with_estimate(
372 GemmMethod::GEMM_HYBRID,
373 "a64_ffhybrid_fp32_mla_6x16",
374 KernelWeightFormat::VL128_BL32,
375 nullptr,
376 [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32_mla_6x16, float, float>::estimate_cycles<float>(args); },
377 [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32_mla_6x16, float, float>(args); }
378),
Francesco Petrogalli553f6952022-06-30 10:22:01 +0000379#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
David Manselle39334c2018-07-06 17:53:35 +0100380#endif // __aarch64__
381
Georgios Pinitas7cd26d42019-01-09 18:35:17 +0000382#ifdef __arm__
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100383{
Georgios Pinitas7cd26d42019-01-09 18:35:17 +0000384 GemmMethod::GEMM_INTERLEAVED,
385 "sgemm_8x6",
386 nullptr,
387 nullptr,
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100388 [](const GemmArgs &args) { return new GemmInterleaved<sgemm_8x6, float, float>(args); }
Georgios Pinitas7cd26d42019-01-09 18:35:17 +0000389},
390#endif // __arm__
391{
392 GemmMethod::DEFAULT,
393 "",
394 nullptr,
395 nullptr,
396 nullptr
397}
David Manselle39334c2018-07-06 17:53:35 +0100398};
399
400/* Templated function to return this list. */
401template<>
Georgios Pinitas7cd26d42019-01-09 18:35:17 +0000402const GemmImplementation<float, float> *gemm_implementation_list<float, float>() {
403 return gemm_fp32_methods;
Pablo Telloeb82fd22018-02-23 13:43:50 +0000404}
405
David Manselle39334c2018-07-06 17:53:35 +0100406/* Explicitly instantiate the external functions for these types. */
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100407template UniqueGemmCommon<float, float> gemm<float, float, Nothing>(const GemmArgs &args, const Nothing &);
Francesco Petrogalli553f6952022-06-30 10:22:01 +0000408template bool has_opt_gemm<float, float, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
Francesco.Petrogalli@arm.com5fcf22d2022-04-05 10:31:08 +0000409template KernelDescription get_gemm_method<float, float, Nothing>(const GemmArgs &args, const Nothing &);
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100410template std::vector<KernelDescription> get_compatible_kernels<float, float, Nothing> (const GemmArgs &args, const Nothing &);
Pablo Telloeb82fd22018-02-23 13:43:50 +0000411
Georgios Pinitas14613832019-03-01 19:07:11 +0000412} // namespace arm_gemm