Blame - src/core/CL/cl_kernels/fft.cl - ml/ComputeLibrary

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame^]

/*

*

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

/** Computes the digit reverse stage

27

*

28

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

29

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

30

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

31

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

32

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

33

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

34

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

35

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

36

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

37

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

38

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

39

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

40

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

41

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

42

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

43

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

44

* @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32

45

* @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)

46

* @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)

47

* @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor

48

*/

49

__kernel void digit_reverse(

50

TENSOR3D_DECLARATION(src),

51

TENSOR3D_DECLARATION(dst),

52

VECTOR_DECLARATION(idx))

53

{

54

// Get tensor pointers

55

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);

56

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

57

Vector idx = CONVERT_TO_VECTOR_STRUCT(idx);

58

59

const unsigned int iidx = *((__global uint *)(idx.ptr));

60

61

// Load data

62

float2 data = vload2(0, (__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));

63

64

// Store result

65

vstore2(data, 0, (__global float *)dst.ptr);

66

}

67

68

/** Calculates and applies the twiddle factor to a given input.

69

*

70

* @param[in] phi The angle.

71

* @param[in,out] input The input on which the factor should be applied.

72

*/

73

#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \

74

{ \

75

float2 w, tmp; \

76

w.x = native_cos(phi); \

77

w.y = native_sin(phi); \

78

tmp.x = (w.x * input.x) - (w.y * input.y); \

79

tmp.y = (w.x * input.y) + (w.y * input.x); \

input = tmp; \

}

/** Computes radix-2 butterfly unit.

84

*

85

* @param[in,out] c0 Complex input 0.

86

* @param[in,out] c1 Complex input 1.

87

*/

88

#define DFT_2(c0, c1) \

{ \

float2 v0; \

v0 = c0; \

c0 = v0 + c1; \

c1 = v0 - c1; \

}

// radix-3 butterfly unit factors

97

#define SQRT3DIV2 0.86602540378443f

98

99

/** Computes radix-3 butterfly unit.

100

*

101

* @param[in,out] c0 Complex input 0.

102

* @param[in,out] c1 Complex input 1.

103

* @param[in,out] c2 Complex input 2.

104

*/

105

#define DFT_3(c0, c1, c2) \

106

{ \

107

float2 v0 = c1 + c2; \

108

float2 v1 = c1 - c2; \

109

c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \

110

c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \

111

c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \

112

c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \

c0 = c0 + v0; \

}

/**Computes radix-4 butterfly unit.

117

*

118

* @param[in,out] c0 Complex input 0.

119

* @param[in,out] c1 Complex input 1.

120

* @param[in,out] c2 Complex input 2.

121

* @param[in,out] c3 Complex input 3.

122

*/

123

#define DFT_4(c0, c1, c2, c3) \

124

{ \

125

float2 v0, v1, v2, v3; \

v0 = c0 + c2; \

v1 = c1 + c3; \

v2 = c0 - c2; \

v3.x = c1.y - c3.y; \

130

v3.y = c3.x - c1.x; \

c0 = v0 + v1; \

c2 = v0 - v1; \

c1 = v2 + v3; \

c3 = v2 - v3; \

}

// radix-5 butterfly unit factors

138

#define W5_A 0.30901699437494f

139

#define W5_B 0.95105651629515f

140

#define W5_C 0.80901699437494f

141

#define W5_D 0.58778525229247f

142

143

/** Computes radix-5 butterfly unit.

144

*

145

* @param[in,out] c0 Complex input 0.

146

* @param[in,out] c1 Complex input 1.

147

* @param[in,out] c2 Complex input 2.

148

* @param[in,out] c3 Complex input 3.

149

* @param[in,out] c4 Complex input 4.

150

*/

151

#define DFT_5(c0, c1, c2, c3, c4) \

152

{ \

153

float2 v0, v1, v2, v3, v4; \

154

v0 = c0; \

155

v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \

156

v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \

157

v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \

158

v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \

159

c0 = v0 + c1 + c2 + c3 + c4; \

160

c1 = v0 + v1 + (float2)(v4.y, -v4.x); \

161

c2 = v0 - v2 + (float2)(v3.y, -v3.x); \

162

c3 = v0 - v2 + (float2)(-v3.y, v3.x); \

163

c4 = v0 + v1 + (float2)(-v4.y, v4.x); \

164

}

165

166

// radix-7 butterfly unit factors

167

#define W7_A 0.62348980185873f

168

#define W7_B 0.78183148246802f

169

#define W7_C 0.22252093395631f

170

#define W7_D 0.97492791218182f

171

#define W7_E 0.90096886790241f

172

#define W7_F 0.43388373911755f

173

174

/** Computes radix-7 butterfly unit.

175

*

176

* @param[in,out] c0 Complex input 0.

177

* @param[in,out] c1 Complex input 1.

178

* @param[in,out] c2 Complex input 2.

179

* @param[in,out] c3 Complex input 3.

180

* @param[in,out] c4 Complex input 4.

181

* @param[in,out] c5 Complex input 5.

182

* @param[in,out] c6 Complex input 6.

183

*/

184

#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \

185

{ \

186

float2 v0, v1, v2, v3, v4, v5, v6; \

187

v0 = c0; \

188

v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \

189

v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \

190

v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \

191

v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \

192

v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \

193

v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \

194

c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \

195

c1 = v0 + v1 + (float2)(v4.y, -v4.x); \

196

c2 = v0 - v2 + (float2)(v5.y, -v5.x); \

197

c3 = v0 - v3 + (float2)(v6.y, -v6.x); \

198

c4 = v0 - v3 + (float2)(-v6.y, v6.x); \

199

c5 = v0 - v2 + (float2)(-v5.y, v5.x); \

200

c6 = v0 + v1 + (float2)(-v4.y, v4.x); \

201

}

202

203

/** Computes radix-8 butterfly unit.

204

*

205

* @param[in,out] c0 Complex input 0.

206

* @param[in,out] c1 Complex input 1.

207

* @param[in,out] c2 Complex input 2.

208

* @param[in,out] c3 Complex input 3.

209

* @param[in,out] c4 Complex input 4.

210

* @param[in,out] c5 Complex input 5.

211

* @param[in,out] c6 Complex input 6.

212

* @param[in,out] c7 Complex input 7.

213

*/

214

#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \

215

{ \

216

float2 v0, v1, v2, v3, v4, v5, v6, v7; \

217

float2 s0, s1, s2, s3, s4, s5, s6, s7; \

float2 t0, t1, t2; \

v0 = c0 + c4; \

v1 = c1 + c5; \

v2 = c2 + c6; \

v3 = c3 + c7; \

v4 = c0 - c4; \

v5 = c1 - c5; \

v6 = c2 - c6; \

v7 = c3 - c7; \

s0 = v0 + v2; \

s1 = v1 + v3; \

s2 = v0 - v2; \

s3 = v1 - v3; \

s4.x = v4.x - v6.y; \

232

s4.y = v4.y + v6.x; \

233

s5.x = v5.x - v7.y; \

234

s5.y = v5.y + v7.x; \

235

s6.x = v4.x + v6.y; \

236

s6.y = v4.y - v6.x; \

237

s7.x = v5.x + v7.y; \

238

s7.y = v5.y - v7.x; \

239

t0.x = -s3.y; \

240

t0.y = s3.x; \

241

t1.x = M_SQRT1_2_F * (s5.x - s5.y); \

242

t1.y = M_SQRT1_2_F * (s5.x + s5.y); \

243

t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \

244

t2.y = M_SQRT1_2_F * (s7.x - s7.y); \

c0 = s0 + s1; \

c1 = s6 - t2; \

c2 = s2 - t0; \

c3 = s4 - t1; \

c4 = s0 - s1; \

c5 = s6 + t2; \

c6 = s2 + t0; \

c7 = s4 + t1; \

}

/** Computes the first stage of a radix-2 DFT.

256

*

257

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

258

*

259

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

260

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

261

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

262

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

263

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

264

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

265

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

266

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

267

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

268

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

269

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

270

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

271

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

272

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

273

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

274

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

275

*/

276

kernel void fft_radix_2_first_stage_axis_0(

277

TENSOR3D_DECLARATION(input)

278

#ifndef IN_PLACE

279

,

280

TENSOR3D_DECLARATION(output)

281

#endif /* not IN_PLACE */

282

)

283

{

284

// Get tensor pointers

285

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

286

#ifdef IN_PLACE

287

Tensor3D output = input;

288

#else /* IN_PLACE */

289

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

290

#endif /* IN_PLACE */

291

292

// Load eight complex input values

293

float4 data = vload4(0, (__global float *)input.ptr);

294

295

// Compute DFT N = 2

296

DFT_2(data.s01, data.s23);

297

298

// Store eight complex output values

299

vstore4(data, 0, (__global float *)output.ptr);

300

}

301

302

/** Computes the first stage of a radix-3 DFT.

303

*

304

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

305

*

306

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

307

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

308

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

309

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

310

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

311

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

312

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

313

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

314

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

315

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

316

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

317

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

318

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

319

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

320

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

321

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

322

*/

323

kernel void fft_radix_3_first_stage_axis_0(

324

TENSOR3D_DECLARATION(input)

325

#ifndef IN_PLACE

326

,

327

TENSOR3D_DECLARATION(output)

328

#endif /* not IN_PLACE */

329

)

330

{

331

// Get tensor pointers

332

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

333

#ifdef IN_PLACE

334

Tensor3D output = input;

335

#else /* IN_PLACE */

336

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

337

#endif /* IN_PLACE */

338

339

// Load eight complex input values

340

float4 data0 = vload4(0, (__global float *)input.ptr);

341

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 2, 0, 0));

342

343

// Compute DFT N = 3

344

DFT_3(data0.s01, data0.s23, data1.s01);

345

346

// Store eight complex output values

347

vstore4(data0, 0, (__global float *)output.ptr);

348

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 2, 0, 0));

349

}

350

351

/** Computes the first stage of a radix-4 DFT.

352

*

353

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

354

*

355

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

356

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

357

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

358

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

359

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

360

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

361

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

362

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

363

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

364

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

365

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

366

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

367

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

368

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

369

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

370

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

371

*/

372

kernel void fft_radix_4_first_stage_axis_0(

373

TENSOR3D_DECLARATION(input)

374

#ifndef IN_PLACE

375

,

376

TENSOR3D_DECLARATION(output)

377

#endif /* not IN_PLACE */

378

)

379

{

380

// Get tensor pointers

381

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

382

#ifdef IN_PLACE

383

Tensor3D output = input;

384

#else /* IN_PLACE */

385

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

386

#endif /* IN_PLACE */

387

388

// Load eight complex input values

389

float8 data = vload8(0, (__global float *)input.ptr);

390

391

// Compute DFT N = 4

392

DFT_4(data.s01, data.s23, data.s45, data.s67);

393

394

// Store eight complex output values

395

vstore8(data, 0, (__global float *)output.ptr);

396

}

397

398

/** Computes the first stage of a radix-5 DFT.

399

*

400

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

401

*

402

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

403

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

404

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

405

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

406

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

407

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

408

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

409

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

410

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

411

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

412

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

413

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

414

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

415

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

416

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

417

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

418

*/

419

kernel void fft_radix_5_first_stage_axis_0(

420

TENSOR3D_DECLARATION(input)

421

#ifndef IN_PLACE

422

,

423

TENSOR3D_DECLARATION(output)

424

#endif /* not IN_PLACE */

425

)

426

{

427

// Get tensor pointers

428

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

429

#ifdef IN_PLACE

430

Tensor3D output = input;

431

#else /* IN_PLACE */

432

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

433

#endif /* IN_PLACE */

434

435

// Load eight complex input values

436

float8 data0 = vload8(0, (__global float *)input.ptr);

437

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));

438

439

// Compute DFT N = 5

440

DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);

441

442

// Store eight complex output values

443

vstore8(data0, 0, (__global float *)output.ptr);

444

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));

445

}

446

447

/** Computes the first stage of a radix-7 DFT.

448

*

449

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

450

*

451

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

452

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

453

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

454

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

455

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

456

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

457

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

458

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

459

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

460

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

461

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

462

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

463

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

464

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

465

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

466

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

467

*/

468

kernel void fft_radix_7_first_stage_axis_0(

469

TENSOR3D_DECLARATION(input)

470

#ifndef IN_PLACE

471

,

472

TENSOR3D_DECLARATION(output)

473

#endif /* not IN_PLACE */

474

)

475

{

476

// Get tensor pointers

477

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

478

#ifdef IN_PLACE

479

Tensor3D output = input;

480

#else /* IN_PLACE */

481

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

482

#endif /* IN_PLACE */

483

484

// Load eight complex input values

485

float8 data0 = vload8(0, (__global float *)input.ptr);

486

float4 data1 = vload4(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));

487

float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 6, 0, 0));

488

489

// Compute DFT N = 7

490

DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);

491

492

// Store eight complex output values

493

vstore8(data0, 0, (__global float *)output.ptr);

494

vstore4(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));

495

vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 6, 0, 0));

496

}

497

498

/** Computes the first stage of a radix-8 DFT.

499

*

500

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

501

*

502

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

503

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

504

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

505

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

506

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

507

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

508

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

509

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

510

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

511

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

512

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

513

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

514

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

515

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

516

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

517

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

518

*/

519

kernel void fft_radix_8_first_stage_axis_0(

520

TENSOR3D_DECLARATION(input)

521

#ifndef IN_PLACE

522

,

523

TENSOR3D_DECLARATION(output)

524

#endif /* not IN_PLACE */

525

)

526

{

527

// Get tensor pointers

528

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

529

#ifdef IN_PLACE

530

Tensor3D output = input;

531

#else /* IN_PLACE */

532

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

533

#endif /* IN_PLACE */

534

535

// Load eight complex input values

536

float16 data = vload16(0, (__global float *)input.ptr);

537

538

// Compute DFT N = 8

539

DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);

540

541

// Store eight complex output values

542

vstore16(data, 0, (__global float *)output.ptr);

543

}

544

545

/** Computes a stage of a radix-2 FFT.

546

*

547

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

548

*

549

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

550

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

551

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

552

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

553

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

554

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

555

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

556

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

557

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

558

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

559

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

560

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

561

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

562

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

563

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

564

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

565

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

566

* @param[in] Ni Nx * Ny.

567

* @param[in] exp_const Exponent constant

568

*/

569

kernel void fft_radix_2_axis_0(

570

TENSOR3D_DECLARATION(input)

571

#ifndef IN_PLACE

572

,

573

TENSOR3D_DECLARATION(output)

574

#endif /* not IN_PLACE */

575

,

576

uint Nx, uint Ni, float exp_const)

577

{

578

// Each work-item computes a single radix-2

579

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

586

587

// Get tensor pointers

588

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

589

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

590

#ifdef IN_PLACE

591

Tensor3D output = input;

592

#else /* IN_PLACE */

593

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

594

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

595

#endif /* IN_PLACE */

596

597

// Load two complex input values

598

float2 c0 = vload2(0, (__global float *)input.ptr);

599

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

600

601

// Compute phi

602

float phi = (float)nx * exp_const;

603

604

// Multiply by twiddle factor

605

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

// Compute DFT N = 2

DFT_2(c0, c1);

// Store two complex output values

611

vstore2(c0, 0, (__global float *)output.ptr);

612

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

613

}

614

615

/** Computes a stage of a radix-3 FFT.

616

*

617

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

618

*

619

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

620

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

621

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

622

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

623

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

624

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

625

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

626

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

627

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

628

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

629

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

630

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

631

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

632

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

633

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

634

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

635

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

636

* @param[in] Ni Nx * Ny.

637

* @param[in] exp_const Exponent constant

638

*/

639

kernel void fft_radix_3_axis_0(

640

TENSOR3D_DECLARATION(input)

641

#ifndef IN_PLACE

642

,

643

TENSOR3D_DECLARATION(output)

644

#endif /* not IN_PLACE */

645

,

646

uint Nx, uint Ni, float exp_const)

647

{

648

// Each work-item computes a single radix-3

649

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

656

657

// Get tensor pointers

658

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

659

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

660

#ifdef IN_PLACE

661

Tensor3D output = input;

662

#else /* IN_PLACE */

663

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

664

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

665

#endif /* IN_PLACE */

666

667

// Load three complex input values

668

float2 c0 = vload2(0, (__global float *)input.ptr);

669

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

670

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

671

672

// Compute phi

673

float phi = (float)nx * exp_const;

674

675

// Multiply by twiddle factor

676

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

677

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

// Compute DFT N = 3

DFT_3(c0, c1, c2);

// Store three complex output values

683

vstore2(c0, 0, (__global float *)output.ptr);

684

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

685

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

686

}

687

688

/** Computes a stage of a radix-4 FFT.

689

*

690

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

691

*

692

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

693

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

694

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

695

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

696

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

697

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

698

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

699

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

700

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

701

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

702

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

703

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

704

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

705

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

706

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

707

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

708

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

709

* @param[in] Ni Nx * Ny.

710

* @param[in] exp_const Exponent constant

711

*/

712

kernel void fft_radix_4_axis_0(

713

TENSOR3D_DECLARATION(input)

714

#ifndef IN_PLACE

715

,

716

TENSOR3D_DECLARATION(output)

717

#endif /* not IN_PLACE */

718

,

719

uint Nx, uint Ni, float exp_const)

720

{

721

// Each work-item computes a single radix-4

722

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

729

730

// Get tensor pointers

731

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

732

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

733

#ifdef IN_PLACE

734

Tensor3D output = input;

735

#else /* IN_PLACE */

736

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

737

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

738

#endif /* IN_PLACE */

739

740

// Load four complex input values

741

float2 c0 = vload2(0, (__global float *)input.ptr);

742

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

743

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

744

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));

745

746

// Compute phi

747

float phi = (float)nx * exp_const;

748

749

// Multiply by twiddle factor

750

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

751

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

752

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

753

754

// Compute DFT N = 4

755

DFT_4(c0, c1, c2, c3);

756

757

// Store four complex output values

758

vstore2(c0, 0, (__global float *)output.ptr);

759

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

760

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

761

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));

762

}

763

764

/** Computes a stage of a radix-5 FFT.

765

*

766

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

767

*

768

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

769

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

770

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

771

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

772

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

773

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

774

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

775

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

776

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

777

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

778

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

779

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

780

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

781

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

782

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

783

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

784

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

785

* @param[in] Ni Nx * Ny.

786

* @param[in] exp_const Exponent constant

787

*/

788

kernel void fft_radix_5_axis_0(

789

TENSOR3D_DECLARATION(input)

790

#ifndef IN_PLACE

791

,

792

TENSOR3D_DECLARATION(output)

793

#endif /* not IN_PLACE */

794

,

795

uint Nx, uint Ni, float exp_const)

796

{

797

// Each work-item computes a single radix-5

798

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

805

806

// Get tensor pointers

807

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

808

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

809

#ifdef IN_PLACE

810

Tensor3D output = input;

811

#else /* IN_PLACE */

812

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

813

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

814

#endif /* IN_PLACE */

815

816

// Load five complex input values

817

float2 c0 = vload2(0, (__global float *)input.ptr);

818

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

819

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

820

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));

821

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));

822

823

// Compute phi

824

float phi = (float)nx * exp_const;

825

826

// Multiply by twiddle factor

827

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

828

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

829

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

830

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

831

832

// Compute DFT N = 5

833

DFT_5(c0, c1, c2, c3, c4);

834

835

// Store five complex output values

836

vstore2(c0, 0, (__global float *)output.ptr);

837

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

838

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

839

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));

840

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));

841

}

842

843

/** Computes a stage of a radix-7 FFT.

844

*

845

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

846

*

847

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

848

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

849

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

850

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

851

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

852

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

853

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

854

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

855

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

856

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

857

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

858

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

859

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

860

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

861

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

862

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

863

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

864

* @param[in] Ni Nx * Ny.

865

* @param[in] exp_const Exponent constant

866

*/

867

kernel void fft_radix_7_axis_0(

868

TENSOR3D_DECLARATION(input)

869

#ifndef IN_PLACE

870

,

871

TENSOR3D_DECLARATION(output)

872

#endif /* not IN_PLACE */

873

,

874

uint Nx, uint Ni, float exp_const)

875

{

876

// Each work-item computes a single radix-7

877

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

884

885

// Get tensor pointers

886

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

887

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

888

#ifdef IN_PLACE

889

Tensor3D output = input;

890

#else /* IN_PLACE */

891

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

892

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

893

#endif /* IN_PLACE */

894

895

// Load seven complex input values

896

float2 c0 = vload2(0, (__global float *)input.ptr);

897

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

898

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

899

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));

900

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));

901

float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));

902

float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));

903

904

// Compute phi

905

float phi = (float)nx * exp_const;

906

907

// Multiply by twiddle factor

908

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

909

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

910

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

911

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

912

TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);

913

TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);

914

915

// Compute DFT N = 7

916

DFT_7(c0, c1, c2, c3, c4, c5, c6);

917

918

// Store seven complex output values

919

vstore2(c0, 0, (__global float *)output.ptr);

920

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

921

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

922

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));

923

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));

924

vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));

925

vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));

926

}

927

928

/** Computes a stage of a radix-8 FFT.

929

*

930

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

931

*

932

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

933

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

934

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

935

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

936

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

937

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

938

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

939

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

940

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

941

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

942

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

943

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

944

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

945

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

946

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

947

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

948

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

949

* @param[in] Ni Nx * Ny.

950

* @param[in] exp_const Exponent constant

951

*/

952

kernel void fft_radix_8_axis_0(

953

TENSOR3D_DECLARATION(input)

954

#ifndef IN_PLACE

955

,

956

TENSOR3D_DECLARATION(output)

957

#endif /* not IN_PLACE */

958

,

959

uint Nx, uint Ni, float exp_const)

960

{

961

// Each work-item computes a single radix-8

962

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

969

970

// Get tensor pointers

971

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

972

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

973

#ifdef IN_PLACE

974

Tensor3D output = input;

975

#else /* IN_PLACE */

976

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

977

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

978

#endif /* IN_PLACE */

979

980

// Load eight complex input values

981

float2 c0 = vload2(0, (__global float *)input.ptr);

982

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

983

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

984

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));

985

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));

986

float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));

987

float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));

988

float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 7 * Nx, 0, 0));

989

990

// Compute phi

991

float phi = (float)nx * exp_const;

992

993

// Multiply by twiddle factor

994

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

995

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

996

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

997

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

998

TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);

999

TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);

1000

TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);

1001

1002

// Compute DFT N = 8

1003

DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);

1004

1005

// Store eight complex output values

1006

vstore2(c0, 0, (__global float *)output.ptr);

1007

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

1008

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

1009

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));

1010

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));

1011

vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));

1012

vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));

1013

vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 7 * Nx, 0, 0));

1014

}