blob: 4b58a7b9ac1b9500d6707b8b0b8c4c3435d96fbe [file] [log] [blame]
giuros0114c4e0f2019-03-26 17:44:40 +00001/*
SiCongLib88272e2021-02-24 15:40:57 +00002 * Copyright (c) 2019-2021 Arm Limited.
giuros0114c4e0f2019-03-26 17:44:40 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
giuros0114c4e0f2019-03-26 17:44:40 +000025
26#include "arm_compute/core/ITensor.h"
giuros0114c4e0f2019-03-26 17:44:40 +000027#include "arm_compute/core/TensorInfo.h"
28#include "arm_compute/core/Types.h"
29#include "arm_compute/core/Utils.h"
30#include "arm_compute/core/Window.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010031
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010034#include "src/core/NEON/wrapper/traits.h"
35#include "src/core/NEON/wrapper/wrapper.h"
SiCongLi410e21e2020-12-11 15:07:53 +000036#include "support/ToolchainSupport.h"
giuros0114c4e0f2019-03-26 17:44:40 +000037
38#include <arm_neon.h>
39#include <cmath>
40#include <complex>
giuros0105fb4482019-03-26 17:44:40 +000041#include <map>
42
giuros0114c4e0f2019-03-26 17:44:40 +000043namespace arm_compute
44{
45namespace
46{
// Single-precision value of pi, narrowed from M_PI (<cmath>)
constexpr float kPi = static_cast<float>(M_PI);

// fft_3 kernel: sqrt(3) / 2, i.e. sin(2 * pi / 3)
constexpr float kSqrt3Div2 = 0.866025403784438;

// fft_5 kernel: magnitudes of the real/imaginary parts of the 5th roots of unity
constexpr float kW5_0 = 0.30901699437494f; // cos(2 * pi / 5)
constexpr float kW5_1 = 0.95105651629515f; // sin(2 * pi / 5)
constexpr float kW5_2 = 0.80901699437494f; // -cos(4 * pi / 5)
constexpr float kW5_3 = 0.58778525229247f; // sin(4 * pi / 5)

// fft_7 kernel: magnitudes of the real/imaginary parts of the 7th roots of unity
constexpr float kW7_0 = 0.62348980185873f; // cos(2 * pi / 7)
constexpr float kW7_1 = 0.78183148246802f; // sin(2 * pi / 7)
constexpr float kW7_2 = 0.22252093395631f; // -cos(4 * pi / 7)
constexpr float kW7_3 = 0.97492791218182f; // sin(4 * pi / 7)
constexpr float kW7_4 = 0.90096886790241f; // -cos(6 * pi / 7)
constexpr float kW7_5 = 0.43388373911755f; // sin(6 * pi / 7)

// fft_8 kernel: sqrt(2) / 2, i.e. cos(pi / 4) == sin(pi / 4)
constexpr float kSqrt2Div2 = 0.707106781186548;
giuros0114c4e0f2019-03-26 17:44:40 +000069
70float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
71{
giuros0105fb4482019-03-26 17:44:40 +000072 using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
giuros0114c4e0f2019-03-26 17:44:40 +000073
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010074 const float32x2_t mask = {-1.0, 1.0};
giuros0105fb4482019-03-26 17:44:40 +000075 const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
76 const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
giuros0114c4e0f2019-03-26 17:44:40 +000077
giuros0105fb4482019-03-26 17:44:40 +000078 float32x2_t res = wrapper::vmul(tmp0, b);
giuros0114c4e0f2019-03-26 17:44:40 +000079
giuros0105fb4482019-03-26 17:44:40 +000080 b = wrapper::vrev64(b);
81 b = wrapper::vmul(b, mask);
82 res = wrapper::vmla(res, tmp1, b);
83
84 return res;
giuros0114c4e0f2019-03-26 17:44:40 +000085}
86
87float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
88{
89 const float a_r = wrapper::vgetlane(a, 0);
90 const float a_i = wrapper::vgetlane(a, 1);
91
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010092 const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant});
giuros0114c4e0f2019-03-26 17:44:40 +000093 return out;
94}
95
96float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_t d, float32x2_t e)
97{
98 const auto t0 = wrapper::vadd(a, b);
99 const auto t1 = wrapper::vadd(c, d);
100 const auto t2 = wrapper::vadd(t0, t1);
101 return wrapper::vadd(t2, e);
102}
103
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100104float32x2_t reduce_sum_7(
105 float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
giuros0114c4e0f2019-03-26 17:44:40 +0000106{
107 const auto t0 = wrapper::vadd(x1, x2);
108 const auto t1 = wrapper::vadd(x3, x4);
109 const auto t2 = wrapper::vadd(x5, x6);
110 const auto t00 = wrapper::vadd(t0, t1);
111 const auto t01 = wrapper::vadd(t2, x7);
112
113 return wrapper::vadd(t00, t01);
114}
115
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100116float32x2_t reduce_sum_8(float32x2_t x1,
117 float32x2_t x2,
118 float32x2_t x3,
119 float32x2_t x4,
120 float32x2_t x5,
121 float32x2_t x6,
122 float32x2_t x7,
123 float32x2_t x8)
giuros0114c4e0f2019-03-26 17:44:40 +0000124{
125 const auto t0 = wrapper::vadd(x1, x2);
126 const auto t1 = wrapper::vadd(x3, x4);
127 const auto t2 = wrapper::vadd(x5, x6);
128 const auto t3 = wrapper::vadd(x7, x8);
129 const auto t00 = wrapper::vadd(t0, t1);
130 const auto t01 = wrapper::vadd(t2, t3);
131
132 return wrapper::vadd(t00, t01);
133}
134
135void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &w)
136{
137 float32x2_t a = x;
138 float32x2_t b = c_mul_neon(w, y);
139
140 x = wrapper::vadd(a, b);
141 y = wrapper::vsub(a, b);
142}
143
giuros0114c4e0f2019-03-26 17:44:40 +0000144void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, const float32x2_t &w2)
145{
146 float32x2_t a = x;
147 float32x2_t b = c_mul_neon(w, y);
148 float32x2_t c = c_mul_neon(w2, z);
149
150 x = wrapper::vadd(a, b);
151 x = wrapper::vadd(x, c);
152
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100153 const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c));
154 const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c));
giuros0114c4e0f2019-03-26 17:44:40 +0000155
156 y = z = wrapper::vsub(a, v1);
157 y = wrapper::vadd(y, v2);
158 z = wrapper::vsub(z, v2);
159}
160
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100161void fft_4(float32x2_t &x1,
162 float32x2_t &x2,
163 float32x2_t &x3,
164 float32x2_t &x4,
165 const float32x2_t &w,
166 const float32x2_t &w2,
167 const float32x2_t &w3)
giuros0114c4e0f2019-03-26 17:44:40 +0000168{
169 float32x2_t a = x1;
170 float32x2_t b = c_mul_neon(w, x2);
171 float32x2_t c = c_mul_neon(w2, x3);
172 float32x2_t d = c_mul_neon(w3, x4);
173
174 const auto x11 = wrapper::vadd(a, b);
175 const auto x12 = wrapper::vadd(c, d);
176 x1 = wrapper::vadd(x11, x12);
177
178 const auto x21 = wrapper::vadd(a, c_mul_neon_img(b, -1));
179 const auto x22 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, 1.f));
180 x2 = wrapper::vadd(x21, x22);
181
182 const auto x31 = wrapper::vadd(a, wrapper::vneg(b));
183 const auto x32 = wrapper::vadd(c, wrapper::vneg(d));
184 x3 = wrapper::vadd(x31, x32);
185
186 const auto x41 = wrapper::vadd(a, c_mul_neon_img(b, 1));
187 const auto x42 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, -1));
188 x4 = wrapper::vadd(x41, x42);
189}
190
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100191void fft_5(float32x2_t &x1,
192 float32x2_t &x2,
193 float32x2_t &x3,
194 float32x2_t &x4,
195 float32x2_t &x5,
196 const float32x2_t &w,
197 const float32x2_t &w2,
198 const float32x2_t &w3,
199 const float32x2_t &w4)
giuros0114c4e0f2019-03-26 17:44:40 +0000200{
201 const auto a = x1;
202 const auto b = c_mul_neon(w, x2);
203 const auto c = c_mul_neon(w2, x3);
204 const auto d = c_mul_neon(w3, x4);
205 const auto e = c_mul_neon(w4, x5);
206
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100207 const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b);
208 const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b);
209 const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b);
210 const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b);
giuros0114c4e0f2019-03-26 17:44:40 +0000211
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100212 const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c);
213 const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c);
214 const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c);
215 const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c);
giuros0114c4e0f2019-03-26 17:44:40 +0000216
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100217 const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d);
218 const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d);
219 const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d);
220 const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d);
giuros0114c4e0f2019-03-26 17:44:40 +0000221
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100222 const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e);
223 const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e);
224 const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e);
225 const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e);
giuros0114c4e0f2019-03-26 17:44:40 +0000226
227 x1 = reduce_sum_5(a, b, c, d, e);
228 x2 = reduce_sum_5(a, b0, c0, d0, e0);
229 x3 = reduce_sum_5(a, b1, c1, d1, e1);
230 x4 = reduce_sum_5(a, b2, c2, d2, e2);
231 x5 = reduce_sum_5(a, b3, c3, d3, e3);
232}
233
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100234void fft_7(float32x2_t &x1,
235 float32x2_t &x2,
236 float32x2_t &x3,
237 float32x2_t &x4,
238 float32x2_t &x5,
239 float32x2_t &x6,
240 float32x2_t &x7,
241 const float32x2_t &w,
242 const float32x2_t &w2,
243 const float32x2_t &w3,
giuros0114c4e0f2019-03-26 17:44:40 +0000244 const float32x2_t &w4,
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100245 const float32x2_t &w5,
246 const float32x2_t &w6)
giuros0114c4e0f2019-03-26 17:44:40 +0000247{
248 const auto a = x1;
249 const auto b = c_mul_neon(w, x2);
250 const auto c = c_mul_neon(w2, x3);
251 const auto d = c_mul_neon(w3, x4);
252 const auto e = c_mul_neon(w4, x5);
253 const auto f = c_mul_neon(w5, x6);
254 const auto g = c_mul_neon(w6, x7);
255
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100256 const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b);
257 const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b);
258 const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b);
259 const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b);
260 const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b);
261 const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b);
giuros0114c4e0f2019-03-26 17:44:40 +0000262
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100263 const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c);
264 const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c);
265 const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c);
266 const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c);
267 const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c);
268 const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c);
giuros0114c4e0f2019-03-26 17:44:40 +0000269
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100270 const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d);
271 const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d);
272 const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d);
273 const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d);
274 const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d);
275 const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d);
giuros0114c4e0f2019-03-26 17:44:40 +0000276
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100277 const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e);
278 const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e);
279 const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e);
280 const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e);
281 const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e);
282 const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e);
giuros0114c4e0f2019-03-26 17:44:40 +0000283
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100284 const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f);
285 const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f);
286 const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f);
287 const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f);
288 const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f);
289 const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f);
giuros0114c4e0f2019-03-26 17:44:40 +0000290
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100291 const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g);
292 const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g);
293 const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g);
294 const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g);
295 const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g);
296 const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g);
giuros0114c4e0f2019-03-26 17:44:40 +0000297
298 x1 = reduce_sum_7(a, b, c, d, e, f, g);
299 x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
300 x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
301 x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
302 x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
303 x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
304 x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
305}
306
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100307void fft_8(float32x2_t &x1,
308 float32x2_t &x2,
309 float32x2_t &x3,
310 float32x2_t &x4,
311 float32x2_t &x5,
312 float32x2_t &x6,
313 float32x2_t &x7,
314 float32x2_t &x8,
315 const float32x2_t &w,
316 const float32x2_t &w2,
giuros0114c4e0f2019-03-26 17:44:40 +0000317 const float32x2_t &w3,
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100318 const float32x2_t &w4,
319 const float32x2_t &w5,
320 const float32x2_t &w6,
giuros0114c4e0f2019-03-26 17:44:40 +0000321 const float32x2_t &w7)
322{
323 const auto a = x1;
324 const auto b = c_mul_neon(w, x2);
325 const auto c = c_mul_neon(w2, x3);
326 const auto d = c_mul_neon(w3, x4);
327 const auto e = c_mul_neon(w4, x5);
328 const auto f = c_mul_neon(w5, x6);
329 const auto g = c_mul_neon(w6, x7);
330 const auto h = c_mul_neon(w7, x8);
331
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100332 const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b);
333 const auto b1 = c_mul_neon(float32x2_t{0, -1}, b);
334 const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b);
335 const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b);
336 const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b);
337 const auto b5 = c_mul_neon(float32x2_t{0, 1}, b);
338 const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b);
giuros0114c4e0f2019-03-26 17:44:40 +0000339
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100340 const auto c0 = c_mul_neon(float32x2_t{0, -1}, c);
341 const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c);
342 const auto c2 = c_mul_neon(float32x2_t{0, 1}, c);
343 const auto c3 = c_mul_neon(float32x2_t{1, 0}, c);
344 const auto c4 = c_mul_neon(float32x2_t{0, -1}, c);
345 const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c);
346 const auto c6 = c_mul_neon(float32x2_t{0, 1}, c);
giuros0114c4e0f2019-03-26 17:44:40 +0000347
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100348 const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d);
349 const auto d1 = c_mul_neon(float32x2_t{0, 1}, d);
350 const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d);
351 const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d);
352 const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d);
353 const auto d5 = c_mul_neon(float32x2_t{0, -1}, d);
354 const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d);
giuros0114c4e0f2019-03-26 17:44:40 +0000355
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100356 const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e);
357 const auto e1 = c_mul_neon(float32x2_t{1, 0}, e);
358 const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e);
359 const auto e3 = c_mul_neon(float32x2_t{1, 0}, e);
360 const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e);
361 const auto e5 = c_mul_neon(float32x2_t{1, 0}, e);
362 const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e);
giuros0114c4e0f2019-03-26 17:44:40 +0000363
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100364 const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f);
365 const auto f1 = c_mul_neon(float32x2_t{0, -1}, f);
366 const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f);
367 const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f);
368 const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f);
369 const auto f5 = c_mul_neon(float32x2_t{0, 1}, f);
370 const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f);
giuros0114c4e0f2019-03-26 17:44:40 +0000371
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100372 const auto g0 = c_mul_neon(float32x2_t{0, 1}, g);
373 const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g);
374 const auto g2 = c_mul_neon(float32x2_t{0, -1}, g);
375 const auto g3 = c_mul_neon(float32x2_t{1, 0}, g);
376 const auto g4 = c_mul_neon(float32x2_t{0, 1}, g);
377 const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g);
378 const auto g6 = c_mul_neon(float32x2_t{0, -1}, g);
giuros0114c4e0f2019-03-26 17:44:40 +0000379
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100380 const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h);
381 const auto h1 = c_mul_neon(float32x2_t{0, 1}, h);
382 const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h);
383 const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h);
384 const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h);
385 const auto h5 = c_mul_neon(float32x2_t{0, -1}, h);
386 const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h);
giuros0114c4e0f2019-03-26 17:44:40 +0000387
388 x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
389 x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
390 x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
391 x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
392 x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
393 x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
394 x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
395 x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
396}
397
398template <bool first_stage>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100399void fft_radix_2_axes_0(
400 float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
giuros0114c4e0f2019-03-26 17:44:40 +0000401{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100402 float32x2_t w{1.0f, 0.0f};
403 for (unsigned int j = 0; j < Nx; j++)
giuros0114c4e0f2019-03-26 17:44:40 +0000404 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100405 for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
giuros0114c4e0f2019-03-26 17:44:40 +0000406 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100407 auto a = float32x2_t{0, 0};
408 auto b = float32x2_t{0, 0};
giuros0114c4e0f2019-03-26 17:44:40 +0000409
410 // Load inputs
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100411 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000412 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100413 const auto ab = wrapper::vloadq(in + k);
giuros0114c4e0f2019-03-26 17:44:40 +0000414 a = wrapper::vgetlow(ab);
415 b = wrapper::vgethigh(ab);
416 }
417 else
418 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100419 a = wrapper::vload(in + k);
420 b = wrapper::vload(in + k + 2 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000421 }
422
423 // Base-case prime transform
424 fft_2(a, b, w);
425
426 // Write outputs
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100427 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000428 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100429 wrapper::vstore(out + k, wrapper::vcombine(a, b));
giuros0114c4e0f2019-03-26 17:44:40 +0000430 }
431 else
432 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100433 wrapper::vstore(out + k, a);
434 wrapper::vstore(out + k + 2 * Nx, b);
giuros0114c4e0f2019-03-26 17:44:40 +0000435 }
436 }
437
438 w = c_mul_neon(w, w_m);
439 }
440}
441
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100442void fft_radix_2_axes_1(float *out,
443 float *in,
444 unsigned int Nx,
445 unsigned int NxRadix,
446 const float32x2_t &w_m,
447 unsigned int N,
448 unsigned int M,
449 unsigned int in_pad_x,
450 unsigned int out_pad_x)
giuros0114c4e0f2019-03-26 17:44:40 +0000451{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100452 float32x2_t w{1.0f, 0.0f};
453 for (unsigned int j = 0; j < Nx; j++)
giuros0105fb4482019-03-26 17:44:40 +0000454 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100455 for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
giuros0105fb4482019-03-26 17:44:40 +0000456 {
457 // Load inputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100458 float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
459 float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
giuros0114c4e0f2019-03-26 17:44:40 +0000460
giuros0105fb4482019-03-26 17:44:40 +0000461 // Base-case prime transform
462 fft_2(a, b, w);
463
464 // Write outputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100465 wrapper::vstore(out + (N + out_pad_x) * k, a);
466 wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
giuros0105fb4482019-03-26 17:44:40 +0000467 }
468
469 w = c_mul_neon(w, w_m);
470 }
471}
472
473template <bool first_stage>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100474void fft_radix_3_axes_0(
475 float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
giuros0105fb4482019-03-26 17:44:40 +0000476{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100477 float32x2_t w{1.0f, 0.0f};
478 for (unsigned int j = 0; j < Nx; j++)
giuros0114c4e0f2019-03-26 17:44:40 +0000479 {
480 const auto w2 = c_mul_neon(w, w);
481
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100482 for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
giuros0114c4e0f2019-03-26 17:44:40 +0000483 {
484 // Load inputs
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100485 float32x2_t a = {0, 0};
486 float32x2_t b = {0, 0};
487 float32x2_t c = {0, 0};
488 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000489 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100490 const auto ab = wrapper::vloadq(in + k);
giuros0114c4e0f2019-03-26 17:44:40 +0000491 a = wrapper::vgetlow(ab);
492 b = wrapper::vgethigh(ab);
493 }
494 else
495 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100496 a = wrapper::vload(in + k);
497 b = wrapper::vload(in + k + 2 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000498 }
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100499 c = wrapper::vload(in + k + 4 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000500
501 // Base-case prime transform
502 fft_3(a, b, c, w, w2);
503
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100504 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000505 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100506 wrapper::vstore(out + k, wrapper::vcombine(a, b));
giuros0114c4e0f2019-03-26 17:44:40 +0000507 }
508 else
509 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100510 wrapper::vstore(out + k, a);
511 wrapper::vstore(out + k + 2 * Nx, b);
giuros0114c4e0f2019-03-26 17:44:40 +0000512 }
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100513 wrapper::vstore(out + k + 4 * Nx, c);
giuros0114c4e0f2019-03-26 17:44:40 +0000514 }
515 w = c_mul_neon(w, w_m);
516 }
517}
518
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100519void fft_radix_3_axes_1(float *out,
520 float *in,
521 unsigned int Nx,
522 unsigned int NxRadix,
523 const float32x2_t &w_m,
524 unsigned int N,
525 unsigned int M,
526 unsigned int in_pad_x,
527 unsigned int out_pad_x)
giuros0114c4e0f2019-03-26 17:44:40 +0000528{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100529 float32x2_t w{1.0f, 0.0f};
530 for (unsigned int j = 0; j < Nx; j++)
giuros0105fb4482019-03-26 17:44:40 +0000531 {
532 const auto w2 = c_mul_neon(w, w);
giuros0114c4e0f2019-03-26 17:44:40 +0000533
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100534 for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
giuros0105fb4482019-03-26 17:44:40 +0000535 {
536 // Load inputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100537 float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
538 float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
539 float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
giuros0114c4e0f2019-03-26 17:44:40 +0000540
giuros0105fb4482019-03-26 17:44:40 +0000541 // Base-case prime transform
542 fft_3(a, b, c, w, w2);
543
544 // Store the output
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100545 wrapper::vstore(out + (N + out_pad_x) * k, a);
546 wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
547 wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
giuros0105fb4482019-03-26 17:44:40 +0000548 }
549 w = c_mul_neon(w, w_m);
550 }
551}
552
553template <bool first_stage>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100554void fft_radix_4_axes_0(
555 float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
giuros0105fb4482019-03-26 17:44:40 +0000556{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100557 float32x2_t w{1.0f, 0.0f};
558 for (unsigned int j = 0; j < Nx; j++)
giuros0114c4e0f2019-03-26 17:44:40 +0000559 {
560 const auto w2 = c_mul_neon(w, w);
561 const auto w3 = c_mul_neon(w2, w);
562
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100563 for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
giuros0114c4e0f2019-03-26 17:44:40 +0000564 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100565 float32x2_t a = {0, 0};
566 float32x2_t b = {0, 0};
567 float32x2_t c = {0, 0};
568 float32x2_t d = {0, 0};
569 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000570 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100571 const auto ab = wrapper::vloadq(in + k);
572 const auto cd = wrapper::vloadq(in + k + 4 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000573 a = wrapper::vgetlow(ab);
574 b = wrapper::vgethigh(ab);
575 c = wrapper::vgetlow(cd);
576 d = wrapper::vgethigh(cd);
577 }
578 else
579 {
580 // Load inputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100581 a = wrapper::vload(in + k);
582 b = wrapper::vload(in + k + 2 * Nx);
583 c = wrapper::vload(in + k + 4 * Nx);
584 d = wrapper::vload(in + k + 6 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000585 }
586
587 // Base-case prime transform
588 fft_4(a, b, c, d, w, w2, w3);
589
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100590 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000591 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100592 wrapper::vstore(out + k, wrapper::vcombine(a, b));
593 wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
giuros0114c4e0f2019-03-26 17:44:40 +0000594 }
595 else
596 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100597 wrapper::vstore(out + k, a);
598 wrapper::vstore(out + k + 2 * Nx, b);
599 wrapper::vstore(out + k + 4 * Nx, c);
600 wrapper::vstore(out + k + 6 * Nx, d);
giuros0114c4e0f2019-03-26 17:44:40 +0000601 }
602 }
603
604 w = c_mul_neon(w, w_m);
605 }
606}
607
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100608void fft_radix_4_axes_1(float *out,
609 float *in,
610 unsigned int Nx,
611 unsigned int NxRadix,
612 const float32x2_t &w_m,
613 unsigned int N,
614 unsigned int M,
615 unsigned int in_pad_x,
616 unsigned int out_pad_x)
giuros0114c4e0f2019-03-26 17:44:40 +0000617{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100618 float32x2_t w{1.0f, 0.0f};
619 for (unsigned int j = 0; j < Nx; j++)
giuros0105fb4482019-03-26 17:44:40 +0000620 {
621 const auto w2 = c_mul_neon(w, w);
622 const auto w3 = c_mul_neon(w2, w);
giuros0114c4e0f2019-03-26 17:44:40 +0000623
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100624 for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
giuros0105fb4482019-03-26 17:44:40 +0000625 {
626 // Load inputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100627 float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
628 float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
629 float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
630 float32x2_t d = wrapper::vload(in + (N + in_pad_x) * (k + 6 * Nx));
giuros0114c4e0f2019-03-26 17:44:40 +0000631
giuros0105fb4482019-03-26 17:44:40 +0000632 // Base-case prime transform
633 fft_4(a, b, c, d, w, w2, w3);
634
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100635 wrapper::vstore(out + (N + out_pad_x) * k, a);
636 wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
637 wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
638 wrapper::vstore(out + (N + out_pad_x) * (k + 6 * Nx), d);
giuros0105fb4482019-03-26 17:44:40 +0000639 }
640
641 w = c_mul_neon(w, w_m);
642 }
643}
644
645template <bool first_stage>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100646void fft_radix_5_axes_0(
647 float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
giuros0105fb4482019-03-26 17:44:40 +0000648{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100649 float32x2_t w{1.0f, 0.0f};
650 for (unsigned int j = 0; j < Nx; j++)
giuros0114c4e0f2019-03-26 17:44:40 +0000651 {
652 const float32x2_t w2 = c_mul_neon(w, w);
653 const float32x2_t w3 = c_mul_neon(w2, w);
654 const float32x2_t w4 = c_mul_neon(w3, w);
655
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100656 for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
giuros0114c4e0f2019-03-26 17:44:40 +0000657 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100658 float32x2_t a = {0, 0};
659 float32x2_t b = {0, 0};
660 float32x2_t c = {0, 0};
661 float32x2_t d = {0, 0};
662 float32x2_t e = {0, 0};
giuros0114c4e0f2019-03-26 17:44:40 +0000663
664 // Load inputs
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100665 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000666 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100667 const auto ab = wrapper::vloadq(in + k);
668 const auto cd = wrapper::vloadq(in + k + 4 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000669
670 a = wrapper::vgetlow(ab);
671 b = wrapper::vgethigh(ab);
672 c = wrapper::vgetlow(cd);
673 d = wrapper::vgethigh(cd);
674 }
675 else
676 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100677 a = wrapper::vload(in + k);
678 b = wrapper::vload(in + k + 2 * Nx);
679 c = wrapper::vload(in + k + 4 * Nx);
680 d = wrapper::vload(in + k + 6 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000681 }
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100682 e = wrapper::vload(in + k + 8 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000683
684 // Base-case prime transform
685 fft_5(a, b, c, d, e, w, w2, w3, w4);
686
687 // Store outputs
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100688 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000689 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100690 wrapper::vstore(out + k, wrapper::vcombine(a, b));
691 wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
giuros0114c4e0f2019-03-26 17:44:40 +0000692 }
693 else
694 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100695 wrapper::vstore(out + k, a);
696 wrapper::vstore(out + k + 2 * Nx, b);
697 wrapper::vstore(out + k + 4 * Nx, c);
698 wrapper::vstore(out + k + 6 * Nx, d);
giuros0114c4e0f2019-03-26 17:44:40 +0000699 }
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100700 wrapper::vstore(out + k + 8 * Nx, e);
giuros0114c4e0f2019-03-26 17:44:40 +0000701 }
702
703 w = c_mul_neon(w, w_m);
704 }
705}
706
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100707void fft_radix_5_axes_1(float *out,
708 float *in,
709 unsigned int Nx,
710 unsigned int NxRadix,
711 const float32x2_t &w_m,
712 unsigned int N,
713 unsigned int M,
714 unsigned int in_pad_x,
715 unsigned int out_pad_x)
giuros0114c4e0f2019-03-26 17:44:40 +0000716{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100717 float32x2_t w{1.0f, 0.0f};
718 for (unsigned int j = 0; j < Nx; j++)
giuros0105fb4482019-03-26 17:44:40 +0000719 {
720 const float32x2_t w2 = c_mul_neon(w, w);
721 const float32x2_t w3 = c_mul_neon(w2, w);
722 const float32x2_t w4 = c_mul_neon(w3, w);
giuros0114c4e0f2019-03-26 17:44:40 +0000723
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100724 for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
giuros0105fb4482019-03-26 17:44:40 +0000725 {
726 // Load inputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100727 float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
728 float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
729 float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
730 float32x2_t d = wrapper::vload(in + (N + in_pad_x) * (k + 6 * Nx));
731 float32x2_t e = wrapper::vload(in + (N + in_pad_x) * (k + 8 * Nx));
giuros0114c4e0f2019-03-26 17:44:40 +0000732
giuros0105fb4482019-03-26 17:44:40 +0000733 // Base-case prime transform
734 fft_5(a, b, c, d, e, w, w2, w3, w4);
735
736 // Store outputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100737 wrapper::vstore(out + (N + out_pad_x) * k, a);
738 wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
739 wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
740 wrapper::vstore(out + (N + out_pad_x) * (k + 6 * Nx), d);
741 wrapper::vstore(out + (N + out_pad_x) * (k + 8 * Nx), e);
giuros0105fb4482019-03-26 17:44:40 +0000742 }
743
744 w = c_mul_neon(w, w_m);
745 }
746}
747
748template <bool first_stage>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100749void fft_radix_7_axes_0(
750 float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
giuros0105fb4482019-03-26 17:44:40 +0000751{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100752 float32x2_t w{1.0f, 0.0f};
753 for (unsigned int j = 0; j < Nx; j++)
giuros0114c4e0f2019-03-26 17:44:40 +0000754 {
755 const float32x2_t w2 = c_mul_neon(w, w);
756 const float32x2_t w3 = c_mul_neon(w2, w);
757 const float32x2_t w4 = c_mul_neon(w3, w);
758 const float32x2_t w5 = c_mul_neon(w4, w);
759 const float32x2_t w6 = c_mul_neon(w5, w);
760
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100761 for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
giuros0114c4e0f2019-03-26 17:44:40 +0000762 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100763 float32x2_t a = {0, 0};
764 float32x2_t b = {0, 0};
765 float32x2_t c = {0, 0};
766 float32x2_t d = {0, 0};
767 float32x2_t e = {0, 0};
768 float32x2_t f = {0, 0};
769 float32x2_t g = {0, 0};
giuros0114c4e0f2019-03-26 17:44:40 +0000770
771 // Load inputs
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100772 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000773 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100774 const auto ab = wrapper::vloadq(in + k);
775 const auto cd = wrapper::vloadq(in + k + 4 * Nx);
776 const auto ef = wrapper::vloadq(in + k + 8 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000777
778 a = wrapper::vgetlow(ab);
779 b = wrapper::vgethigh(ab);
780 c = wrapper::vgetlow(cd);
781 d = wrapper::vgethigh(cd);
782 e = wrapper::vgetlow(ef);
783 f = wrapper::vgethigh(ef);
784 }
785 else
786 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100787 a = wrapper::vload(in + k);
788 b = wrapper::vload(in + k + 2 * Nx);
789 c = wrapper::vload(in + k + 4 * Nx);
790 d = wrapper::vload(in + k + 6 * Nx);
791 e = wrapper::vload(in + k + 8 * Nx);
792 f = wrapper::vload(in + k + 10 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000793 }
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100794 g = wrapper::vload(in + k + 12 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000795
796 // Base-case prime transform
797 fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
798
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100799 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000800 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100801 wrapper::vstore(out + k, wrapper::vcombine(a, b));
802 wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
803 wrapper::vstore(out + k + 8 * Nx, wrapper::vcombine(e, f));
giuros0114c4e0f2019-03-26 17:44:40 +0000804 }
805 else
806 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100807 wrapper::vstore(out + k, a);
808 wrapper::vstore(out + k + 2 * Nx, b);
809 wrapper::vstore(out + k + 4 * Nx, c);
810 wrapper::vstore(out + k + 6 * Nx, d);
811 wrapper::vstore(out + k + 8 * Nx, e);
812 wrapper::vstore(out + k + 10 * Nx, f);
giuros0114c4e0f2019-03-26 17:44:40 +0000813 }
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100814 wrapper::vstore(out + k + 12 * Nx, g);
giuros0114c4e0f2019-03-26 17:44:40 +0000815 }
816
817 w = c_mul_neon(w, w_m);
818 }
819}
820
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100821void fft_radix_7_axes_1(float *out,
822 float *in,
823 unsigned int Nx,
824 unsigned int NxRadix,
825 const float32x2_t &w_m,
826 unsigned int N,
827 unsigned int M,
828 unsigned int in_pad_x,
829 unsigned int out_pad_x)
giuros0114c4e0f2019-03-26 17:44:40 +0000830{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100831 float32x2_t w{1.0f, 0.0f};
832 for (unsigned int j = 0; j < Nx; j++)
giuros0105fb4482019-03-26 17:44:40 +0000833 {
834 const float32x2_t w2 = c_mul_neon(w, w);
835 const float32x2_t w3 = c_mul_neon(w2, w);
836 const float32x2_t w4 = c_mul_neon(w3, w);
837 const float32x2_t w5 = c_mul_neon(w4, w);
838 const float32x2_t w6 = c_mul_neon(w5, w);
giuros0114c4e0f2019-03-26 17:44:40 +0000839
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100840 for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
giuros0105fb4482019-03-26 17:44:40 +0000841 {
842 // Load inputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100843 float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
844 float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
845 float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
846 float32x2_t d = wrapper::vload(in + (N + in_pad_x) * (k + 6 * Nx));
847 float32x2_t e = wrapper::vload(in + (N + in_pad_x) * (k + 8 * Nx));
848 float32x2_t f = wrapper::vload(in + (N + in_pad_x) * (k + 10 * Nx));
849 float32x2_t g = wrapper::vload(in + (N + in_pad_x) * (k + 12 * Nx));
giuros0114c4e0f2019-03-26 17:44:40 +0000850
giuros0105fb4482019-03-26 17:44:40 +0000851 // Base-case prime transform
852 fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
853
854 // Store outputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100855 wrapper::vstore(out + (N + out_pad_x) * k, a);
856 wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
857 wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
858 wrapper::vstore(out + (N + out_pad_x) * (k + 6 * Nx), d);
859 wrapper::vstore(out + (N + out_pad_x) * (k + 8 * Nx), e);
860 wrapper::vstore(out + (N + out_pad_x) * (k + 10 * Nx), f);
861 wrapper::vstore(out + (N + out_pad_x) * (k + 12 * Nx), g);
giuros0105fb4482019-03-26 17:44:40 +0000862 }
863
864 w = c_mul_neon(w, w_m);
865 }
866}
867
868template <bool first_stage>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100869void fft_radix_8_axes_0(
870 float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
giuros0105fb4482019-03-26 17:44:40 +0000871{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100872 float32x2_t w{1.0f, 0.0f};
873 for (unsigned int j = 0; j < Nx; j++)
giuros0114c4e0f2019-03-26 17:44:40 +0000874 {
875 const float32x2_t w2 = c_mul_neon(w, w);
876 const float32x2_t w3 = c_mul_neon(w2, w);
877 const float32x2_t w4 = c_mul_neon(w3, w);
878 const float32x2_t w5 = c_mul_neon(w4, w);
879 const float32x2_t w6 = c_mul_neon(w5, w);
880 const float32x2_t w7 = c_mul_neon(w6, w);
881
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100882 for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
giuros0114c4e0f2019-03-26 17:44:40 +0000883 {
884 // Load inputs
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100885 float32x2_t a = {0, 0};
886 float32x2_t b = {0, 0};
887 float32x2_t c = {0, 0};
888 float32x2_t d = {0, 0};
889 float32x2_t e = {0, 0};
890 float32x2_t f = {0, 0};
891 float32x2_t g = {0, 0};
892 float32x2_t h = {0, 0};
giuros0114c4e0f2019-03-26 17:44:40 +0000893
894 // Base-case prime transform
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100895 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000896 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100897 const auto ab = wrapper::vloadq(in + k);
898 const auto cd = wrapper::vloadq(in + k + 4 * Nx);
899 const auto ef = wrapper::vloadq(in + k + 8 * Nx);
900 const auto gh = wrapper::vloadq(in + k + 12 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000901
902 a = wrapper::vgetlow(ab);
903 b = wrapper::vgethigh(ab);
904 c = wrapper::vgetlow(cd);
905 d = wrapper::vgethigh(cd);
906 e = wrapper::vgetlow(ef);
907 f = wrapper::vgethigh(ef);
908 g = wrapper::vgetlow(gh);
909 h = wrapper::vgethigh(gh);
910 }
911 else
912 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100913 a = wrapper::vload(in + k);
914 b = wrapper::vload(in + k + 2 * Nx);
915 c = wrapper::vload(in + k + 4 * Nx);
916 d = wrapper::vload(in + k + 6 * Nx);
917 e = wrapper::vload(in + k + 8 * Nx);
918 f = wrapper::vload(in + k + 10 * Nx);
919 g = wrapper::vload(in + k + 12 * Nx);
920 h = wrapper::vload(in + k + 14 * Nx);
giuros0114c4e0f2019-03-26 17:44:40 +0000921 }
922
923 // Apply twiddle factors
924 fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
925
926 // Store outputs
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100927 if (first_stage)
giuros0114c4e0f2019-03-26 17:44:40 +0000928 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100929 wrapper::vstore(out + k, wrapper::vcombine(a, b));
930 wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
931 wrapper::vstore(out + k + 8 * Nx, wrapper::vcombine(e, f));
932 wrapper::vstore(out + k + 12 * Nx, wrapper::vcombine(g, h));
giuros0114c4e0f2019-03-26 17:44:40 +0000933 }
934 else
935 {
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100936 wrapper::vstore(out + k, a);
937 wrapper::vstore(out + k + 2 * Nx, b);
938 wrapper::vstore(out + k + 4 * Nx, c);
939 wrapper::vstore(out + k + 6 * Nx, d);
940 wrapper::vstore(out + k + 8 * Nx, e);
941 wrapper::vstore(out + k + 10 * Nx, f);
942 wrapper::vstore(out + k + 12 * Nx, g);
943 wrapper::vstore(out + k + 14 * Nx, h);
giuros0114c4e0f2019-03-26 17:44:40 +0000944 }
945 }
946
947 w = c_mul_neon(w, w_m);
948 }
949}
950
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100951void fft_radix_8_axes_1(float *out,
952 float *in,
953 unsigned int Nx,
954 unsigned int NxRadix,
955 const float32x2_t &w_m,
956 unsigned int N,
957 unsigned int M,
958 unsigned int in_pad_x,
959 unsigned int out_pad_x)
giuros0105fb4482019-03-26 17:44:40 +0000960{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100961 float32x2_t w{1.0f, 0.0f};
962 for (unsigned int j = 0; j < Nx; j++)
giuros0105fb4482019-03-26 17:44:40 +0000963 {
964 const float32x2_t w2 = c_mul_neon(w, w);
965 const float32x2_t w3 = c_mul_neon(w2, w);
966 const float32x2_t w4 = c_mul_neon(w3, w);
967 const float32x2_t w5 = c_mul_neon(w4, w);
968 const float32x2_t w6 = c_mul_neon(w5, w);
969 const float32x2_t w7 = c_mul_neon(w6, w);
970
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100971 for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
giuros0105fb4482019-03-26 17:44:40 +0000972 {
973 // Load inputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100974 float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
975 float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
976 float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
977 float32x2_t d = wrapper::vload(in + (N + in_pad_x) * (k + 6 * Nx));
978 float32x2_t e = wrapper::vload(in + (N + in_pad_x) * (k + 8 * Nx));
979 float32x2_t f = wrapper::vload(in + (N + in_pad_x) * (k + 10 * Nx));
980 float32x2_t g = wrapper::vload(in + (N + in_pad_x) * (k + 12 * Nx));
981 float32x2_t h = wrapper::vload(in + (N + in_pad_x) * (k + 14 * Nx));
giuros0105fb4482019-03-26 17:44:40 +0000982
983 // Base-case prime transform
984 fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
985
986 // Store outputs
Manuel Bottini9a81cd82021-04-15 17:44:55 +0100987 wrapper::vstore(out + (N + out_pad_x) * k, a);
988 wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
989 wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
990 wrapper::vstore(out + (N + out_pad_x) * (k + 6 * Nx), d);
991 wrapper::vstore(out + (N + out_pad_x) * (k + 8 * Nx), e);
992 wrapper::vstore(out + (N + out_pad_x) * (k + 10 * Nx), f);
993 wrapper::vstore(out + (N + out_pad_x) * (k + 12 * Nx), g);
994 wrapper::vstore(out + (N + out_pad_x) * (k + 14 * Nx), h);
giuros0105fb4482019-03-26 17:44:40 +0000995 }
996
997 w = c_mul_neon(w, w_m);
998 }
999}
1000
giuros0114c4e0f2019-03-26 17:44:40 +00001001Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
1002{
1003 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
giuros0105fb4482019-03-26 17:44:40 +00001004 ARM_COMPUTE_RETURN_ERROR_ON(config.axis > 1);
giuros0114c4e0f2019-03-26 17:44:40 +00001005 ARM_COMPUTE_RETURN_ERROR_ON(NEFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
giuros0105fb4482019-03-26 17:44:40 +00001006 ARM_COMPUTE_UNUSED(config);
giuros0114c4e0f2019-03-26 17:44:40 +00001007
1008 // Checks performed when output is configured
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001009 if ((output != nullptr) && (output->total_size() != 0))
giuros0114c4e0f2019-03-26 17:44:40 +00001010 {
1011 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
1012 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
1013 }
1014
1015 return Status{};
1016}
1017
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001018std::pair<Status, Window>
1019validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
giuros0114c4e0f2019-03-26 17:44:40 +00001020{
giuros0105fb4482019-03-26 17:44:40 +00001021 ARM_COMPUTE_UNUSED(config);
1022
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001023 if (output != nullptr)
giuros0114c4e0f2019-03-26 17:44:40 +00001024 {
1025 auto_init_if_empty(*output, *input);
1026 }
1027
giuros0105fb4482019-03-26 17:44:40 +00001028 Window win = calculate_max_window(*input, Steps());
giuros0114c4e0f2019-03-26 17:44:40 +00001029
1030 return std::make_pair(Status{}, win);
1031}
1032} // namespace
1033
1034NEFFTRadixStageKernel::NEFFTRadixStageKernel()
Manuel Bottini9a81cd82021-04-15 17:44:55 +01001035 : _input(nullptr), _output(nullptr), _Nx(0), _axis(0), _radix(0), _func_0(), _func_1()
giuros0114c4e0f2019-03-26 17:44:40 +00001036{
1037}
1038
giuros0105fb4482019-03-26 17:44:40 +00001039void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo &config)
giuros0114c4e0f2019-03-26 17:44:40 +00001040{
giuros0105fb4482019-03-26 17:44:40 +00001041 // FFT table axis 0: [radix, first_stage]
1042 static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
1043
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001044 if (fft_table_axis0.empty())
giuros0114c4e0f2019-03-26 17:44:40 +00001045 {
giuros0105fb4482019-03-26 17:44:40 +00001046 fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
1047 fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
1048 fft_table_axis0[4][false] = &fft_radix_4_axes_0<false>;
1049 fft_table_axis0[5][false] = &fft_radix_5_axes_0<false>;
1050 fft_table_axis0[7][false] = &fft_radix_7_axes_0<false>;
1051 fft_table_axis0[8][false] = &fft_radix_8_axes_0<false>;
1052
1053 fft_table_axis0[2][true] = &fft_radix_2_axes_0<true>;
1054 fft_table_axis0[3][true] = &fft_radix_3_axes_0<true>;
1055 fft_table_axis0[4][true] = &fft_radix_4_axes_0<true>;
1056 fft_table_axis0[5][true] = &fft_radix_5_axes_0<true>;
1057 fft_table_axis0[7][true] = &fft_radix_7_axes_0<true>;
1058 fft_table_axis0[8][true] = &fft_radix_8_axes_0<true>;
giuros0114c4e0f2019-03-26 17:44:40 +00001059 }
giuros0105fb4482019-03-26 17:44:40 +00001060
1061 _func_0 = fft_table_axis0[config.radix][config.is_first_stage];
1062}
1063
1064void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo &config)
1065{
1066 // FFT table axis 1: [radix, first_stage]
1067 static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
1068
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001069 if (fft_table_axis1.empty())
giuros0105fb4482019-03-26 17:44:40 +00001070 {
1071 fft_table_axis1[2] = &fft_radix_2_axes_1;
1072 fft_table_axis1[3] = &fft_radix_3_axes_1;
1073 fft_table_axis1[4] = &fft_radix_4_axes_1;
1074 fft_table_axis1[5] = &fft_radix_5_axes_1;
1075 fft_table_axis1[7] = &fft_radix_7_axes_1;
1076 fft_table_axis1[8] = &fft_radix_8_axes_1;
1077 }
1078
1079 _func_1 = fft_table_axis1[config.radix];
giuros0114c4e0f2019-03-26 17:44:40 +00001080}
1081
1082void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config)
1083{
1084 ARM_COMPUTE_ERROR_ON_NULLPTR(input);
1085
1086 // Output auto inizialitation if not yet initialized
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001087 if (output != nullptr)
giuros0114c4e0f2019-03-26 17:44:40 +00001088 {
1089 auto_init_if_empty(*output->info(), *input->info()->clone());
1090 }
1091
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001092 ARM_COMPUTE_ERROR_THROW_ON(
1093 validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
giuros0114c4e0f2019-03-26 17:44:40 +00001094
Manuel Bottini9a81cd82021-04-15 17:44:55 +01001095 _input = input;
1096 _output = (output == nullptr) ? input : output;
1097 _Nx = config.Nx;
1098 _axis = config.axis;
1099 _radix = config.radix;
giuros0114c4e0f2019-03-26 17:44:40 +00001100
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001101 switch (config.axis)
giuros0114c4e0f2019-03-26 17:44:40 +00001102 {
giuros0105fb4482019-03-26 17:44:40 +00001103 case 0:
1104 set_radix_stage_axis0(config);
1105 break;
1106 case 1:
1107 set_radix_stage_axis1(config);
1108 break;
1109 default:
1110 ARM_COMPUTE_ERROR("Axis not supported");
1111 break;
giuros0114c4e0f2019-03-26 17:44:40 +00001112 }
1113
1114 // Configure kernel window
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001115 auto win_config =
1116 validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
giuros0114c4e0f2019-03-26 17:44:40 +00001117 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
1118 INEKernel::configure(win_config.second);
1119}
1120
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001121Status NEFFTRadixStageKernel::validate(const ITensorInfo *input,
1122 const ITensorInfo *output,
1123 const FFTRadixStageKernelInfo &config)
giuros0114c4e0f2019-03-26 17:44:40 +00001124{
1125 const bool run_in_place = (output == nullptr) || (output == input);
1126 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001127 ARM_COMPUTE_RETURN_ON_ERROR(
1128 validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
1129 .first);
giuros0114c4e0f2019-03-26 17:44:40 +00001130
1131 return Status{};
1132}
1133
1134std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
1135{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001136 return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
giuros0114c4e0f2019-03-26 17:44:40 +00001137}
1138
1139void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
1140{
giuros0114c4e0f2019-03-26 17:44:40 +00001141 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1142 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
giuros0105fb4482019-03-26 17:44:40 +00001143 ARM_COMPUTE_UNUSED(info);
giuros0114c4e0f2019-03-26 17:44:40 +00001144
1145 Window input_window = window;
giuros0105fb4482019-03-26 17:44:40 +00001146 input_window.set(_axis, 0);
giuros0114c4e0f2019-03-26 17:44:40 +00001147
1148 Iterator in(_input, input_window);
Manuel Bottini9a81cd82021-04-15 17:44:55 +01001149 Iterator out(_output, input_window);
giuros0114c4e0f2019-03-26 17:44:40 +00001150
giuros0105fb4482019-03-26 17:44:40 +00001151 // Precompute FFT constants
1152 const unsigned int NxRadix = _radix * _Nx;
1153 const float alpha = 2.0f * kPi / float(NxRadix);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001154 const float32x2_t w_m{cosf(alpha), -sinf(alpha)};
giuros0105fb4482019-03-26 17:44:40 +00001155
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001156 if (_axis == 0)
giuros0114c4e0f2019-03-26 17:44:40 +00001157 {
giuros0105fb4482019-03-26 17:44:40 +00001158 const unsigned int N = _input->info()->dimension(0);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001159 execute_window_loop(
1160 input_window,
1161 [&](const Coordinates &) {
1162 _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m,
1163 N);
1164 },
1165 in, out);
giuros0105fb4482019-03-26 17:44:40 +00001166 }
1167 else
1168 {
1169 const unsigned int N = _input->info()->dimension(0);
1170 const unsigned int M = _input->info()->dimension(1);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001171 execute_window_loop(
1172 input_window,
1173 [&](const Coordinates &)
1174 {
1175 _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N,
1176 M, _input->info()->padding().right + _input->info()->padding().left,
1177 _output->info()->padding().right + _output->info()->padding().left);
1178 },
1179 in, out);
giuros0105fb4482019-03-26 17:44:40 +00001180 }
giuros0114c4e0f2019-03-26 17:44:40 +00001181
1182 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1183 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1184}
1185} // namespace arm_compute