blob: 5f1ef2483b7b0f2cd04df1f335a89179535968cc [file] [log] [blame]
Georgios Pinitas0bc78492019-03-18 20:07:37 +00001/*
2 * Copyright (c) 2019 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26/** Computes the digit reverse stage
27 *
28 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
29 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
30 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
31 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
32 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
33 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
34 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
35 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
36 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
37 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
38 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
39 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
40 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
41 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
42 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
43 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
44 * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
45 * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
46 * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
47 * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
48 */
49__kernel void digit_reverse(
50 TENSOR3D_DECLARATION(src),
51 TENSOR3D_DECLARATION(dst),
52 VECTOR_DECLARATION(idx))
53{
54 // Get tensor pointers
55 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
56 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
57 Vector idx = CONVERT_TO_VECTOR_STRUCT(idx);
58
59 const unsigned int iidx = *((__global uint *)(idx.ptr));
60
61 // Load data
62 float2 data = vload2(0, (__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
63
64 // Store result
65 vstore2(data, 0, (__global float *)dst.ptr);
66}
67
68/** Calculates and applies the twiddle factor to a given input.
69 *
70 * @param[in] phi The angle.
71 * @param[in,out] input The input on which the factor should be applied.
72 */
73#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \
74 { \
75 float2 w, tmp; \
76 w.x = native_cos(phi); \
77 w.y = native_sin(phi); \
78 tmp.x = (w.x * input.x) - (w.y * input.y); \
79 tmp.y = (w.x * input.y) + (w.y * input.x); \
80 input = tmp; \
81 }
82
83/** Computes radix-2 butterfly unit.
84 *
85 * @param[in,out] c0 Complex input 0.
86 * @param[in,out] c1 Complex input 1.
87 */
88#define DFT_2(c0, c1) \
89 { \
90 float2 v0; \
91 v0 = c0; \
92 c0 = v0 + c1; \
93 c1 = v0 - c1; \
94 }
95
96// radix-3 butterfly unit factors
97#define SQRT3DIV2 0.86602540378443f
98
99/** Computes radix-3 butterfly unit.
100 *
101 * @param[in,out] c0 Complex input 0.
102 * @param[in,out] c1 Complex input 1.
103 * @param[in,out] c2 Complex input 2.
104 */
105#define DFT_3(c0, c1, c2) \
106 { \
107 float2 v0 = c1 + c2; \
108 float2 v1 = c1 - c2; \
109 c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
110 c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
111 c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
112 c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
113 c0 = c0 + v0; \
114 }
115
116/**Computes radix-4 butterfly unit.
117 *
118 * @param[in,out] c0 Complex input 0.
119 * @param[in,out] c1 Complex input 1.
120 * @param[in,out] c2 Complex input 2.
121 * @param[in,out] c3 Complex input 3.
122 */
123#define DFT_4(c0, c1, c2, c3) \
124 { \
125 float2 v0, v1, v2, v3; \
126 v0 = c0 + c2; \
127 v1 = c1 + c3; \
128 v2 = c0 - c2; \
129 v3.x = c1.y - c3.y; \
130 v3.y = c3.x - c1.x; \
131 c0 = v0 + v1; \
132 c2 = v0 - v1; \
133 c1 = v2 + v3; \
134 c3 = v2 - v3; \
135 }
136
137// radix-5 butterfly unit factors
138#define W5_A 0.30901699437494f
139#define W5_B 0.95105651629515f
140#define W5_C 0.80901699437494f
141#define W5_D 0.58778525229247f
142
143/** Computes radix-5 butterfly unit.
144 *
145 * @param[in,out] c0 Complex input 0.
146 * @param[in,out] c1 Complex input 1.
147 * @param[in,out] c2 Complex input 2.
148 * @param[in,out] c3 Complex input 3.
149 * @param[in,out] c4 Complex input 4.
150 */
151#define DFT_5(c0, c1, c2, c3, c4) \
152 { \
153 float2 v0, v1, v2, v3, v4; \
154 v0 = c0; \
155 v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \
156 v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \
157 v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \
158 v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \
159 c0 = v0 + c1 + c2 + c3 + c4; \
160 c1 = v0 + v1 + (float2)(v4.y, -v4.x); \
161 c2 = v0 - v2 + (float2)(v3.y, -v3.x); \
162 c3 = v0 - v2 + (float2)(-v3.y, v3.x); \
163 c4 = v0 + v1 + (float2)(-v4.y, v4.x); \
164 }
165
166// radix-7 butterfly unit factors
167#define W7_A 0.62348980185873f
168#define W7_B 0.78183148246802f
169#define W7_C 0.22252093395631f
170#define W7_D 0.97492791218182f
171#define W7_E 0.90096886790241f
172#define W7_F 0.43388373911755f
173
174/** Computes radix-7 butterfly unit.
175 *
176 * @param[in,out] c0 Complex input 0.
177 * @param[in,out] c1 Complex input 1.
178 * @param[in,out] c2 Complex input 2.
179 * @param[in,out] c3 Complex input 3.
180 * @param[in,out] c4 Complex input 4.
181 * @param[in,out] c5 Complex input 5.
182 * @param[in,out] c6 Complex input 6.
183 */
184#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \
185 { \
186 float2 v0, v1, v2, v3, v4, v5, v6; \
187 v0 = c0; \
188 v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
189 v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
190 v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
191 v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
192 v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
193 v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
194 c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \
195 c1 = v0 + v1 + (float2)(v4.y, -v4.x); \
196 c2 = v0 - v2 + (float2)(v5.y, -v5.x); \
197 c3 = v0 - v3 + (float2)(v6.y, -v6.x); \
198 c4 = v0 - v3 + (float2)(-v6.y, v6.x); \
199 c5 = v0 - v2 + (float2)(-v5.y, v5.x); \
200 c6 = v0 + v1 + (float2)(-v4.y, v4.x); \
201 }
202
203/** Computes radix-8 butterfly unit.
204 *
205 * @param[in,out] c0 Complex input 0.
206 * @param[in,out] c1 Complex input 1.
207 * @param[in,out] c2 Complex input 2.
208 * @param[in,out] c3 Complex input 3.
209 * @param[in,out] c4 Complex input 4.
210 * @param[in,out] c5 Complex input 5.
211 * @param[in,out] c6 Complex input 6.
212 * @param[in,out] c7 Complex input 7.
213 */
214#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
215 { \
216 float2 v0, v1, v2, v3, v4, v5, v6, v7; \
217 float2 s0, s1, s2, s3, s4, s5, s6, s7; \
218 float2 t0, t1, t2; \
219 v0 = c0 + c4; \
220 v1 = c1 + c5; \
221 v2 = c2 + c6; \
222 v3 = c3 + c7; \
223 v4 = c0 - c4; \
224 v5 = c1 - c5; \
225 v6 = c2 - c6; \
226 v7 = c3 - c7; \
227 s0 = v0 + v2; \
228 s1 = v1 + v3; \
229 s2 = v0 - v2; \
230 s3 = v1 - v3; \
231 s4.x = v4.x - v6.y; \
232 s4.y = v4.y + v6.x; \
233 s5.x = v5.x - v7.y; \
234 s5.y = v5.y + v7.x; \
235 s6.x = v4.x + v6.y; \
236 s6.y = v4.y - v6.x; \
237 s7.x = v5.x + v7.y; \
238 s7.y = v5.y - v7.x; \
239 t0.x = -s3.y; \
240 t0.y = s3.x; \
241 t1.x = M_SQRT1_2_F * (s5.x - s5.y); \
242 t1.y = M_SQRT1_2_F * (s5.x + s5.y); \
243 t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \
244 t2.y = M_SQRT1_2_F * (s7.x - s7.y); \
245 c0 = s0 + s1; \
246 c1 = s6 - t2; \
247 c2 = s2 - t0; \
248 c3 = s4 - t1; \
249 c4 = s0 - s1; \
250 c5 = s6 + t2; \
251 c6 = s2 + t0; \
252 c7 = s4 + t1; \
253 }
254
255/** Computes the first stage of a radix-2 DFT.
256 *
257 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
258 *
259 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
260 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
261 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
262 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
263 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
264 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
265 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
266 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
267 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
268 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
269 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
270 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
271 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
272 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
273 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
274 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
275 */
276kernel void fft_radix_2_first_stage_axis_0(
277 TENSOR3D_DECLARATION(input)
278#ifndef IN_PLACE
279 ,
280 TENSOR3D_DECLARATION(output)
281#endif /* not IN_PLACE */
282)
283{
284 // Get tensor pointers
285 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
286#ifdef IN_PLACE
287 Tensor3D output = input;
288#else /* IN_PLACE */
289 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
290#endif /* IN_PLACE */
291
292 // Load eight complex input values
293 float4 data = vload4(0, (__global float *)input.ptr);
294
295 // Compute DFT N = 2
296 DFT_2(data.s01, data.s23);
297
298 // Store eight complex output values
299 vstore4(data, 0, (__global float *)output.ptr);
300}
301
302/** Computes the first stage of a radix-3 DFT.
303 *
304 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
305 *
306 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
307 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
308 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
309 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
310 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
311 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
312 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
313 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
314 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
315 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
316 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
317 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
318 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
319 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
320 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
321 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
322 */
323kernel void fft_radix_3_first_stage_axis_0(
324 TENSOR3D_DECLARATION(input)
325#ifndef IN_PLACE
326 ,
327 TENSOR3D_DECLARATION(output)
328#endif /* not IN_PLACE */
329)
330{
331 // Get tensor pointers
332 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
333#ifdef IN_PLACE
334 Tensor3D output = input;
335#else /* IN_PLACE */
336 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
337#endif /* IN_PLACE */
338
339 // Load eight complex input values
340 float4 data0 = vload4(0, (__global float *)input.ptr);
341 float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 2, 0, 0));
342
343 // Compute DFT N = 3
344 DFT_3(data0.s01, data0.s23, data1.s01);
345
346 // Store eight complex output values
347 vstore4(data0, 0, (__global float *)output.ptr);
348 vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 2, 0, 0));
349}
350
351/** Computes the first stage of a radix-4 DFT.
352 *
353 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
354 *
355 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
356 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
357 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
358 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
359 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
360 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
361 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
362 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
363 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
364 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
365 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
366 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
367 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
368 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
369 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
370 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
371 */
372kernel void fft_radix_4_first_stage_axis_0(
373 TENSOR3D_DECLARATION(input)
374#ifndef IN_PLACE
375 ,
376 TENSOR3D_DECLARATION(output)
377#endif /* not IN_PLACE */
378)
379{
380 // Get tensor pointers
381 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
382#ifdef IN_PLACE
383 Tensor3D output = input;
384#else /* IN_PLACE */
385 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
386#endif /* IN_PLACE */
387
388 // Load eight complex input values
389 float8 data = vload8(0, (__global float *)input.ptr);
390
391 // Compute DFT N = 4
392 DFT_4(data.s01, data.s23, data.s45, data.s67);
393
394 // Store eight complex output values
395 vstore8(data, 0, (__global float *)output.ptr);
396}
397
398/** Computes the first stage of a radix-5 DFT.
399 *
400 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
401 *
402 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
403 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
404 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
405 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
406 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
407 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
408 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
409 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
410 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
411 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
412 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
413 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
414 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
415 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
416 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
417 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
418 */
419kernel void fft_radix_5_first_stage_axis_0(
420 TENSOR3D_DECLARATION(input)
421#ifndef IN_PLACE
422 ,
423 TENSOR3D_DECLARATION(output)
424#endif /* not IN_PLACE */
425)
426{
427 // Get tensor pointers
428 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
429#ifdef IN_PLACE
430 Tensor3D output = input;
431#else /* IN_PLACE */
432 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
433#endif /* IN_PLACE */
434
435 // Load eight complex input values
436 float8 data0 = vload8(0, (__global float *)input.ptr);
437 float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
438
439 // Compute DFT N = 5
440 DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
441
442 // Store eight complex output values
443 vstore8(data0, 0, (__global float *)output.ptr);
444 vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
445}
446
447/** Computes the first stage of a radix-7 DFT.
448 *
449 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
450 *
451 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
452 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
453 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
454 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
455 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
456 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
457 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
458 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
459 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
460 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
461 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
462 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
463 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
464 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
465 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
466 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
467 */
468kernel void fft_radix_7_first_stage_axis_0(
469 TENSOR3D_DECLARATION(input)
470#ifndef IN_PLACE
471 ,
472 TENSOR3D_DECLARATION(output)
473#endif /* not IN_PLACE */
474)
475{
476 // Get tensor pointers
477 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
478#ifdef IN_PLACE
479 Tensor3D output = input;
480#else /* IN_PLACE */
481 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
482#endif /* IN_PLACE */
483
484 // Load eight complex input values
485 float8 data0 = vload8(0, (__global float *)input.ptr);
486 float4 data1 = vload4(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
487 float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 6, 0, 0));
488
489 // Compute DFT N = 7
490 DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
491
492 // Store eight complex output values
493 vstore8(data0, 0, (__global float *)output.ptr);
494 vstore4(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
495 vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 6, 0, 0));
496}
497
498/** Computes the first stage of a radix-8 DFT.
499 *
500 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
501 *
502 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
503 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
504 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
505 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
506 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
507 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
508 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
509 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
510 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
511 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
512 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
513 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
514 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
515 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
516 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
517 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
518 */
519kernel void fft_radix_8_first_stage_axis_0(
520 TENSOR3D_DECLARATION(input)
521#ifndef IN_PLACE
522 ,
523 TENSOR3D_DECLARATION(output)
524#endif /* not IN_PLACE */
525)
526{
527 // Get tensor pointers
528 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
529#ifdef IN_PLACE
530 Tensor3D output = input;
531#else /* IN_PLACE */
532 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
533#endif /* IN_PLACE */
534
535 // Load eight complex input values
536 float16 data = vload16(0, (__global float *)input.ptr);
537
538 // Compute DFT N = 8
539 DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
540
541 // Store eight complex output values
542 vstore16(data, 0, (__global float *)output.ptr);
543}
544
545/** Computes a stage of a radix-2 FFT.
546 *
547 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
548 *
549 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
550 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
551 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
552 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
553 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
554 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
555 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
556 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
557 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
558 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
559 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
560 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
561 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
562 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
563 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
564 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
565 * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage
566 * @param[in] Ni Nx * Ny.
567 * @param[in] exp_const Exponent constant
568 */
569kernel void fft_radix_2_axis_0(
570 TENSOR3D_DECLARATION(input)
571#ifndef IN_PLACE
572 ,
573 TENSOR3D_DECLARATION(output)
574#endif /* not IN_PLACE */
575 ,
576 uint Nx, uint Ni, float exp_const)
577{
578 // Each work-item computes a single radix-2
579 uint kx = get_global_id(0);
580
581 // Compute nx
582 uint nx = kx % Nx;
583
584 // Compute n index
585 uint n = nx + (kx / Nx) * Ni;
586
587 // Get tensor pointers
588 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
589 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
590#ifdef IN_PLACE
591 Tensor3D output = input;
592#else /* IN_PLACE */
593 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
594 output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
595#endif /* IN_PLACE */
596
597 // Load two complex input values
598 float2 c0 = vload2(0, (__global float *)input.ptr);
599 float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
600
601 // Compute phi
602 float phi = (float)nx * exp_const;
603
604 // Multiply by twiddle factor
605 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
606
607 // Compute DFT N = 2
608 DFT_2(c0, c1);
609
610 // Store two complex output values
611 vstore2(c0, 0, (__global float *)output.ptr);
612 vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
613}
614
615/** Computes a stage of a radix-3 FFT.
616 *
617 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
618 *
619 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
620 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
621 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
622 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
623 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
624 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
625 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
626 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
627 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
628 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
629 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
630 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
631 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
632 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
633 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
634 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
635 * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage
636 * @param[in] Ni Nx * Ny.
637 * @param[in] exp_const Exponent constant
638 */
639kernel void fft_radix_3_axis_0(
640 TENSOR3D_DECLARATION(input)
641#ifndef IN_PLACE
642 ,
643 TENSOR3D_DECLARATION(output)
644#endif /* not IN_PLACE */
645 ,
646 uint Nx, uint Ni, float exp_const)
647{
648 // Each work-item computes a single radix-3
649 uint kx = get_global_id(0);
650
651 // Compute nx
652 uint nx = kx % Nx;
653
654 // Compute n index
655 uint n = nx + (kx / Nx) * Ni;
656
657 // Get tensor pointers
658 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
659 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
660#ifdef IN_PLACE
661 Tensor3D output = input;
662#else /* IN_PLACE */
663 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
664 output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
665#endif /* IN_PLACE */
666
667 // Load three complex input values
668 float2 c0 = vload2(0, (__global float *)input.ptr);
669 float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
670 float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
671
672 // Compute phi
673 float phi = (float)nx * exp_const;
674
675 // Multiply by twiddle factor
676 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
677 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
678
679 // Compute DFT N = 3
680 DFT_3(c0, c1, c2);
681
682 // Store three complex output values
683 vstore2(c0, 0, (__global float *)output.ptr);
684 vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
685 vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
686}
687
688/** Computes a stage of a radix-4 FFT.
689 *
690 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
691 *
692 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
693 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
694 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
695 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
696 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
697 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
698 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
699 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
700 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
701 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
702 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
703 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
704 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
705 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
706 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
707 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
708 * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage
709 * @param[in] Ni Nx * Ny.
710 * @param[in] exp_const Exponent constant
711 */
712kernel void fft_radix_4_axis_0(
713 TENSOR3D_DECLARATION(input)
714#ifndef IN_PLACE
715 ,
716 TENSOR3D_DECLARATION(output)
717#endif /* not IN_PLACE */
718 ,
719 uint Nx, uint Ni, float exp_const)
720{
721 // Each work-item computes a single radix-4
722 uint kx = get_global_id(0);
723
724 // Compute nx
725 uint nx = kx % Nx;
726
727 // Compute n index
728 uint n = nx + (kx / Nx) * Ni;
729
730 // Get tensor pointers
731 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
732 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
733#ifdef IN_PLACE
734 Tensor3D output = input;
735#else /* IN_PLACE */
736 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
737 output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
738#endif /* IN_PLACE */
739
740 // Load four complex input values
741 float2 c0 = vload2(0, (__global float *)input.ptr);
742 float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
743 float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
744 float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
745
746 // Compute phi
747 float phi = (float)nx * exp_const;
748
749 // Multiply by twiddle factor
750 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
751 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
752 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
753
754 // Compute DFT N = 4
755 DFT_4(c0, c1, c2, c3);
756
757 // Store four complex output values
758 vstore2(c0, 0, (__global float *)output.ptr);
759 vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
760 vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
761 vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
762}
763
764/** Computes a stage of a radix-5 FFT.
765 *
766 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
767 *
768 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
769 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
770 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
771 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
772 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
773 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
774 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
775 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
776 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
777 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
778 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
779 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
780 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
781 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
782 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
783 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
784 * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage
785 * @param[in] Ni Nx * Ny.
786 * @param[in] exp_const Exponent constant
787 */
788kernel void fft_radix_5_axis_0(
789 TENSOR3D_DECLARATION(input)
790#ifndef IN_PLACE
791 ,
792 TENSOR3D_DECLARATION(output)
793#endif /* not IN_PLACE */
794 ,
795 uint Nx, uint Ni, float exp_const)
796{
797 // Each work-item computes a single radix-5
798 uint kx = get_global_id(0);
799
800 // Compute nx
801 uint nx = kx % Nx;
802
803 // Compute n index
804 uint n = nx + (kx / Nx) * Ni;
805
806 // Get tensor pointers
807 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
808 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
809#ifdef IN_PLACE
810 Tensor3D output = input;
811#else /* IN_PLACE */
812 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
813 output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
814#endif /* IN_PLACE */
815
816 // Load five complex input values
817 float2 c0 = vload2(0, (__global float *)input.ptr);
818 float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
819 float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
820 float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
821 float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
822
823 // Compute phi
824 float phi = (float)nx * exp_const;
825
826 // Multiply by twiddle factor
827 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
828 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
829 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
830 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
831
832 // Compute DFT N = 5
833 DFT_5(c0, c1, c2, c3, c4);
834
835 // Store five complex output values
836 vstore2(c0, 0, (__global float *)output.ptr);
837 vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
838 vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
839 vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
840 vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
841}
842
843/** Computes a stage of a radix-7 FFT.
844 *
845 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
846 *
847 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
848 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
849 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
850 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
851 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
852 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
853 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
854 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
855 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
856 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
857 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
858 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
859 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
860 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
861 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
862 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
863 * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage
864 * @param[in] Ni Nx * Ny.
865 * @param[in] exp_const Exponent constant
866 */
867kernel void fft_radix_7_axis_0(
868 TENSOR3D_DECLARATION(input)
869#ifndef IN_PLACE
870 ,
871 TENSOR3D_DECLARATION(output)
872#endif /* not IN_PLACE */
873 ,
874 uint Nx, uint Ni, float exp_const)
875{
876 // Each work-item computes a single radix-7
877 uint kx = get_global_id(0);
878
879 // Compute nx
880 uint nx = kx % Nx;
881
882 // Compute n index
883 uint n = nx + (kx / Nx) * Ni;
884
885 // Get tensor pointers
886 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
887 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
888#ifdef IN_PLACE
889 Tensor3D output = input;
890#else /* IN_PLACE */
891 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
892 output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
893#endif /* IN_PLACE */
894
895 // Load seven complex input values
896 float2 c0 = vload2(0, (__global float *)input.ptr);
897 float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
898 float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
899 float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
900 float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
901 float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
902 float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
903
904 // Compute phi
905 float phi = (float)nx * exp_const;
906
907 // Multiply by twiddle factor
908 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
909 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
910 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
911 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
912 TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
913 TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
914
915 // Compute DFT N = 7
916 DFT_7(c0, c1, c2, c3, c4, c5, c6);
917
918 // Store seven complex output values
919 vstore2(c0, 0, (__global float *)output.ptr);
920 vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
921 vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
922 vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
923 vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
924 vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
925 vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
926}
927
928/** Computes a stage of a radix-8 FFT.
929 *
930 * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
931 *
932 * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
933 * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
934 * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
935 * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
936 * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
937 * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
938 * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
939 * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
940 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
941 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
942 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
943 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
944 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
945 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
946 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
947 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
948 * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage
949 * @param[in] Ni Nx * Ny.
950 * @param[in] exp_const Exponent constant
951 */
952kernel void fft_radix_8_axis_0(
953 TENSOR3D_DECLARATION(input)
954#ifndef IN_PLACE
955 ,
956 TENSOR3D_DECLARATION(output)
957#endif /* not IN_PLACE */
958 ,
959 uint Nx, uint Ni, float exp_const)
960{
961 // Each work-item computes a single radix-8
962 uint kx = get_global_id(0);
963
964 // Compute nx
965 uint nx = kx % Nx;
966
967 // Compute n index
968 uint n = nx + (kx / Nx) * Ni;
969
970 // Get tensor pointers
971 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
972 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
973#ifdef IN_PLACE
974 Tensor3D output = input;
975#else /* IN_PLACE */
976 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
977 output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
978#endif /* IN_PLACE */
979
980 // Load eight complex input values
981 float2 c0 = vload2(0, (__global float *)input.ptr);
982 float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
983 float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
984 float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
985 float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
986 float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
987 float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
988 float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 7 * Nx, 0, 0));
989
990 // Compute phi
991 float phi = (float)nx * exp_const;
992
993 // Multiply by twiddle factor
994 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
995 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
996 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
997 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
998 TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
999 TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
1000 TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
1001
1002 // Compute DFT N = 8
1003 DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
1004
1005 // Store eight complex output values
1006 vstore2(c0, 0, (__global float *)output.ptr);
1007 vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
1008 vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
1009 vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
1010 vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
1011 vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
1012 vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
1013 vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 7 * Nx, 0, 0));
1014}