blob: 760328f3babc5486036e223fdd11c0b9069c1c06 [file] [log] [blame]
/*
 * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25#include "arm_gemm_local.hpp"
26
27#include "depthwise_implementation.hpp"
28#include "depthwise_depthfirst.hpp"
29#include "depthwise_depthfirst_generic.hpp"
30#include "depthwise_depthfirst_multiplier.hpp"
ramelg018a164882022-04-07 02:42:52 +010031#include "depthwise_planar.hpp"
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +000032
33#include "depthwise_implementation_constraints.hpp"
34
Viet-Hoa Do03b29712022-06-01 11:47:14 +010035#include "interleaves/list.hpp"
36
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +000037#if defined(__aarch64__)
Michalis Spyrou20fca522021-06-07 14:23:57 +010038#if defined(ARM_COMPUTE_ENABLE_SVE)
Viet-Hoa Do03b29712022-06-01 11:47:14 +010039#if defined(ARM_COMPUTE_ENABLE_SME2)
40#include "kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp"
41#include "kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp"
42#include "kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp"
43#include "kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp"
44
45#include "kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp"
46#include "kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp"
47#include "kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp"
48#include "kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp"
49
50#include "kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
51#include "kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
52#include "kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
53#include "kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
54#endif // defined(ARM_COMPUTE_ENABLE_SME2)
55
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +000056#include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
57#include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
58#include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
59#include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
60#include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
61#include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
62#include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
63#include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
64#include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
Michalis Spyrou20fca522021-06-07 14:23:57 +010065#endif // defined(ARM_COMPUTE_ENABLE_SVE)
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +000066#include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
67#include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
68#include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
69#include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
70#include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
71#include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
72#include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
73#include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
74#include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
75#endif // defined(__aarch64__)
76
77namespace arm_conv {
78namespace depthwise {
79
80namespace
81{
Michael Tyler4cf80672023-07-05 14:32:00 +010082#if defined(__aarch64__)
Michael Tyler8deee9b2023-06-30 11:26:05 +010083 bool prefer_premultiply(const DepthwiseArgs &args) {
84 if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))
85 {
86 return false;
87 }
88
89 unsigned int threshold;
90
91 if (args.stride_rows == 1 && args.kernel_rows == 3)
92 {
93 threshold = 18;
94 }
95 else if (args.stride_rows == 1 && args.kernel_rows == 5)
96 {
97 threshold = 5;
98 }
99 else if (args.stride_rows == 2 && args.kernel_rows == 3)
100 {
101 threshold = 5;
102 }
103 else if (args.stride_rows == 2 && args.kernel_rows == 5)
104 {
105 threshold = 12;
106 } else
107 {
108 return false;
109 }
110
111 return args.channel_multiplier <= threshold;
112 }
113
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000114 template <class Strategy>
115 unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
116 {
Michael Tyler8deee9b2023-06-30 11:26:05 +0100117 if (args.channel_multiplier > 1 && !prefer_premultiply(args))
118 {
Michael Tyler4c30de02023-07-07 12:01:32 +0100119 return std::numeric_limits<unsigned int>::max();
Michael Tyler8deee9b2023-06-30 11:26:05 +0100120 }
121
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000122 // First-pass: compute the number of output pixels which will be computed.
123 return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
124 arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
125 arm_gemm::iceildiv(
126 (long unsigned) args.input_channels * args.channel_multiplier,
127 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
128 );
129 }
130
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100131 template <class Strategy>
132 unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
133 {
134 // First-pass: compute the number of output pixels which will be computed.
135 return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
136 args.output_cols *
137 arm_gemm::iceildiv(
138 (long unsigned) args.input_channels * args.channel_multiplier,
139 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
140 );
141 }
142
Michael Tyler74921ee2023-04-12 17:43:17 +0100143 template <class Strategy>
144 unsigned int fast_mode_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
145 {
146 // First-pass: compute the number of output pixels which will be computed.
147 return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
148 arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
149 arm_gemm::iceildiv(
150 (long unsigned) args.input_channels * args.channel_multiplier,
151 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
152 ) * 2 / 3;
153 }
154
Michael Tyler8deee9b2023-06-30 11:26:05 +0100155 unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
156 {
Michael Tyler4c30de02023-07-07 12:01:32 +0100157 return prefer_premultiply(args)? std::numeric_limits<unsigned int>::max() : 0;
Michael Tyler8deee9b2023-06-30 11:26:05 +0100158 }
159
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000160 unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
161 {
162 return std::numeric_limits<unsigned int>::max();
163 }
164
ramelg018a164882022-04-07 02:42:52 +0100165 bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
166 bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000167 {
ramelg018a164882022-04-07 02:42:52 +0100168 return args.fast_mode;
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000169 }
Freddie Liardet487d3902021-09-21 12:36:43 +0100170#endif // defined(__aarch64__)
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000171}
172
173static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
174#if defined(__aarch64__)
Michalis Spyrou20fca522021-06-07 14:23:57 +0100175#if defined(ARM_COMPUTE_ENABLE_SVE)
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100176#if defined(ARM_COMPUTE_ENABLE_SME2)
177 {
178 DepthwiseMethod::PLANAR,
179 "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
180 constraint(fast_mode_enabled,
181 cpu_has_sme, cpu_has_sme2,
182 is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
183 has_no_channel_multiplier, no_prime_right_pad),
184 nullptr,
185 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
186 auto strat = new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(args.cpu_info);
187 return new DepthwisePlanar<float>(strat, args);
188 },
189 },
190 {
191 DepthwiseMethod::PLANAR,
192 "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
193 constraint(fast_mode_enabled,
194 cpu_has_sme, cpu_has_sme2,
195 is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
196 has_no_channel_multiplier, no_prime_right_pad),
197 nullptr,
198 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
199 auto strat = new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(args.cpu_info);
200 return new DepthwisePlanar<float>(strat, args);
201 },
202 },
203 {
204 DepthwiseMethod::PLANAR,
205 "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
206 constraint(fast_mode_enabled,
207 cpu_has_sme, cpu_has_sme2,
208 is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
209 has_no_channel_multiplier, no_prime_right_pad),
210 nullptr,
211 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
212 auto strat = new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(args.cpu_info);
213 return new DepthwisePlanar<float>(strat, args);
214 },
215 },
216 {
217 DepthwiseMethod::PLANAR,
218 "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
219 constraint(fast_mode_enabled,
220 cpu_has_sme, cpu_has_sme2,
221 is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
222 has_no_channel_multiplier, no_prime_right_pad),
223 nullptr,
224 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
225 auto strat = new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(args.cpu_info);
226 return new DepthwisePlanar<float>(strat, args);
227 },
228 },
229
230 {
231 DepthwiseMethod::PLANAR,
232 "sme2_fp32_planar_3x3_s1_4rows_mla_za",
233 constraint(cpu_has_sme, cpu_has_sme2,
234 is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
235 has_no_channel_multiplier, no_prime_right_pad),
236 [] (const DepthwiseArgs &args, const Nothing &os) -> unsigned int {
237 // Heuristic, don't prefer this kernel unless the input plane is greater
238 // than the number of channels.
239 if (args.input_rows * args.input_cols < args.input_channels)
240 return UINT32_MAX;
241
242 return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(args, os);
243 },
244 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
245 auto strat = new sme2_fp32_planar_3x3_s1_4rows_mla_za(args.cpu_info);
246 return new DepthwisePlanar<float>(strat, args);
247 },
248 },
249 {
250 DepthwiseMethod::PLANAR,
251 "sme2_fp32_planar_3x3_s2_4rows_mla_za",
252 constraint(cpu_has_sme, cpu_has_sme2,
253 is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
254 has_no_channel_multiplier, no_prime_right_pad),
255 planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
256 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
257 auto strat = new sme2_fp32_planar_3x3_s2_4rows_mla_za(args.cpu_info);
258 return new DepthwisePlanar<float>(strat, args);
259 },
260 },
261 {
262 DepthwiseMethod::PLANAR,
263 "sme2_fp32_planar_5x5_s1_4rows_mla_za",
264 constraint(cpu_has_sme, cpu_has_sme2,
265 is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
266 has_no_channel_multiplier, no_prime_right_pad),
267 nullptr,
268 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
269 auto strat = new sme2_fp32_planar_5x5_s1_4rows_mla_za(args.cpu_info);
270 return new DepthwisePlanar<float>(strat, args);
271 },
272 },
273 {
274 DepthwiseMethod::PLANAR,
275 "sme2_fp32_planar_5x5_s2_4rows_mla_za",
276 constraint(cpu_has_sme, cpu_has_sme2,
277 is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
278 has_no_channel_multiplier, no_prime_right_pad),
279 nullptr,
280 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
281 auto strat = new sme2_fp32_planar_5x5_s2_4rows_mla_za(args.cpu_info);
282 return new DepthwisePlanar<float>(strat, args);
283 },
284 },
285
286 {
287 DepthwiseMethod::DEPTHFIRST,
288 "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
289 constraint(cpu_has_sme, cpu_has_sme2,
Michael Tyler8deee9b2023-06-30 11:26:05 +0100290 is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100291 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
292 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
293 auto strat = new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
294 return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
295 },
296 },
297 {
298 DepthwiseMethod::DEPTHFIRST,
299 "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
300 constraint(cpu_has_sme, cpu_has_sme2,
Michael Tyler8deee9b2023-06-30 11:26:05 +0100301 is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100302 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
303 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
304 auto strat = new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
305 return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
306 },
307 },
308 {
309 DepthwiseMethod::DEPTHFIRST,
310 "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
311 constraint(cpu_has_sme, cpu_has_sme2,
Michael Tyler8deee9b2023-06-30 11:26:05 +0100312 is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100313 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
314 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
315 auto strat = new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
316 return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
317 },
318 },
319 {
320 DepthwiseMethod::DEPTHFIRST,
321 "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
322 constraint(cpu_has_sme, cpu_has_sme2,
Michael Tyler8deee9b2023-06-30 11:26:05 +0100323 is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100324 cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
325 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
326 auto strat = new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
327 return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
328 },
329 },
330#endif // defined(ARM_COMPUTE_ENABLE_SME2)
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000331 {
332 DepthwiseMethod::DEPTHFIRST,
333 "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
334 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
Michalis Spyrou20fca522021-06-07 14:23:57 +0100335 cpu_has_sve),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000336 cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
337 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100338 auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100339 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000340 },
341 },
342 {
343 DepthwiseMethod::DEPTHFIRST,
344 "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
345 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
Michalis Spyrou20fca522021-06-07 14:23:57 +0100346 cpu_has_sve),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000347 cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
348 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100349 auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100350 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000351 },
352 },
353 {
354 DepthwiseMethod::DEPTHFIRST,
355 "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
356 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
Michalis Spyrou20fca522021-06-07 14:23:57 +0100357 cpu_has_sve),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000358 cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
359 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100360 auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100361 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000362 },
363 },
364 {
365 DepthwiseMethod::DEPTHFIRST,
366 "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
367 constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
Michalis Spyrou20fca522021-06-07 14:23:57 +0100368 cpu_has_sve),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000369 cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
370 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100371 auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100372 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000373 },
374 },
375 {
376 DepthwiseMethod::DEPTHFIRST,
377 "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
378 constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
Michalis Spyrou20fca522021-06-07 14:23:57 +0100379 cpu_has_sve),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000380 cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
381 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100382 auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100383 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000384 },
385 },
386 {
387 DepthwiseMethod::DEPTHFIRST,
388 "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
Michael Tyler8deee9b2023-06-30 11:26:05 +0100389 constraint(cpu_has_sve),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000390 not_preferred,
391 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
ramelg018a164882022-04-07 02:42:52 +0100392 auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
393 auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
394 return new DepthwiseDepthfirstGeneric<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000395 },
396 },
397 {
398 DepthwiseMethod::DEPTHFIRST,
399 "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
Freddie Liardetd216f572021-08-03 15:57:32 +0100400 constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
ramelg018a164882022-04-07 02:42:52 +0100401 cpu_has_sve, has_channel_multiplier),
Michael Tyler8deee9b2023-06-30 11:26:05 +0100402 multiplier_cycle_estimate,
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000403 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
ramelg018a164882022-04-07 02:42:52 +0100404 auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
405 return new DepthwiseDepthfirstMultiplier<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000406 },
407 },
408 {
409 DepthwiseMethod::DEPTHFIRST,
410 "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
Freddie Liardetd216f572021-08-03 15:57:32 +0100411 constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
ramelg018a164882022-04-07 02:42:52 +0100412 cpu_has_sve, has_channel_multiplier),
Michael Tyler8deee9b2023-06-30 11:26:05 +0100413 multiplier_cycle_estimate,
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000414 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
ramelg018a164882022-04-07 02:42:52 +0100415 auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
416 return new DepthwiseDepthfirstMultiplier<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000417 },
418 },
419 {
420 DepthwiseMethod::DEPTHFIRST,
421 "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
ramelg018a164882022-04-07 02:42:52 +0100422 constraint(cpu_has_sve, has_channel_multiplier),
Michael Tyler8deee9b2023-06-30 11:26:05 +0100423 multiplier_cycle_estimate,
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000424 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
ramelg018a164882022-04-07 02:42:52 +0100425 auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
426 auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
427 return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000428 },
429 },
Michalis Spyrou20fca522021-06-07 14:23:57 +0100430#endif // defined(ARM_COMPUTE_ENABLE_SVE)
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000431 {
432 DepthwiseMethod::DEPTHFIRST,
433 "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
Michael Tyler8deee9b2023-06-30 11:26:05 +0100434 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000435 cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
436 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100437 auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100438 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000439 },
440 },
441 {
442 DepthwiseMethod::DEPTHFIRST,
443 "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
Michael Tyler8deee9b2023-06-30 11:26:05 +0100444 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000445 cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
446 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100447 auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100448 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000449 },
450 },
451 {
452 DepthwiseMethod::DEPTHFIRST,
453 "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
Michael Tyler8deee9b2023-06-30 11:26:05 +0100454 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000455 cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
456 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100457 auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100458 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000459 },
460 },
461 {
462 DepthwiseMethod::DEPTHFIRST,
463 "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
Michael Tyler8deee9b2023-06-30 11:26:05 +0100464 constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000465 cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
466 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100467 auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100468 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000469 },
470 },
471 {
472 DepthwiseMethod::DEPTHFIRST,
473 "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
Michael Tyler8deee9b2023-06-30 11:26:05 +0100474 constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>),
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000475 cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
476 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
Freddie Liardetd216f572021-08-03 15:57:32 +0100477 auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
ramelg018a164882022-04-07 02:42:52 +0100478 return new DepthwiseDepthfirst<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000479 },
480 },
481 {
482 DepthwiseMethod::DEPTHFIRST,
483 "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
Michael Tyler8deee9b2023-06-30 11:26:05 +0100484 nullptr,
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000485 not_preferred,
486 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
ramelg018a164882022-04-07 02:42:52 +0100487 auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
488 auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
489 return new DepthwiseDepthfirstGeneric<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000490 },
491 },
492 {
493 DepthwiseMethod::DEPTHFIRST,
494 "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
ramelg018a164882022-04-07 02:42:52 +0100495 constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
496 has_channel_multiplier),
Michael Tyler8deee9b2023-06-30 11:26:05 +0100497 multiplier_cycle_estimate,
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000498 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
ramelg018a164882022-04-07 02:42:52 +0100499 auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
500 return new DepthwiseDepthfirstMultiplier<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000501 },
502 },
503 {
504 DepthwiseMethod::DEPTHFIRST,
505 "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
ramelg018a164882022-04-07 02:42:52 +0100506 constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
507 has_channel_multiplier),
Michael Tyler8deee9b2023-06-30 11:26:05 +0100508 multiplier_cycle_estimate,
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000509 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
ramelg018a164882022-04-07 02:42:52 +0100510 auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
511 return new DepthwiseDepthfirstMultiplier<float>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000512 },
513 },
514 {
515 DepthwiseMethod::DEPTHFIRST,
516 "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
ramelg018a164882022-04-07 02:42:52 +0100517 constraint(has_channel_multiplier),
Michael Tyler8deee9b2023-06-30 11:26:05 +0100518 multiplier_cycle_estimate,
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000519 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
ramelg018a164882022-04-07 02:42:52 +0100520 auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
521 auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
522 return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000523 },
524 },
525#endif // defined(__aarch64__)
526 { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
527};
528
529template <>
530const DepthwiseImplementation<float> *depthwise_implementation_list()
531{
532 return depthwise_fp32_methods;
533}
534
535template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
536template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
537
538} // namespace depthwise
539} // namespace arm_conv