Blame - src/core/CL/cl_kernels/fft.cl - ml/ComputeLibrary

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

/*

*

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

Georgios Pinitas

2019-03-18 20:07:37 +0000

[diff] [blame]

26

/** Calculates and applies the twiddle factor to a given input.

27

*

28

* @param[in] phi The angle.

29

* @param[in,out] input The input on which the factor should be applied.

30

*/

31

#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \

32

{ \

33

float2 w, tmp; \

34

w.x = native_cos(phi); \

35

w.y = native_sin(phi); \

36

tmp.x = (w.x * input.x) - (w.y * input.y); \

37

tmp.y = (w.x * input.y) + (w.y * input.x); \

input = tmp; \

}

/** Computes radix-2 butterfly unit.

42

*

43

* @param[in,out] c0 Complex input 0.

44

* @param[in,out] c1 Complex input 1.

45

*/

46

#define DFT_2(c0, c1) \

{ \

float2 v0; \

v0 = c0; \

c0 = v0 + c1; \

c1 = v0 - c1; \

}

// radix-3 butterfly unit factors

55

#define SQRT3DIV2 0.86602540378443f

56

57

/** Computes radix-3 butterfly unit.

58

*

59

* @param[in,out] c0 Complex input 0.

60

* @param[in,out] c1 Complex input 1.

61

* @param[in,out] c2 Complex input 2.

62

*/

63

#define DFT_3(c0, c1, c2) \

64

{ \

65

float2 v0 = c1 + c2; \

66

float2 v1 = c1 - c2; \

67

c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \

68

c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \

69

c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \

70

c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \

c0 = c0 + v0; \

}

/**Computes radix-4 butterfly unit.

75

*

76

* @param[in,out] c0 Complex input 0.

77

* @param[in,out] c1 Complex input 1.

78

* @param[in,out] c2 Complex input 2.

79

* @param[in,out] c3 Complex input 3.

80

*/

81

#define DFT_4(c0, c1, c2, c3) \

82

{ \

83

float2 v0, v1, v2, v3; \

v0 = c0 + c2; \

v1 = c1 + c3; \

v2 = c0 - c2; \

v3.x = c1.y - c3.y; \

88

v3.y = c3.x - c1.x; \

c0 = v0 + v1; \

c2 = v0 - v1; \

c1 = v2 + v3; \

c3 = v2 - v3; \

}

// radix-5 butterfly unit factors

96

#define W5_A 0.30901699437494f

97

#define W5_B 0.95105651629515f

98

#define W5_C 0.80901699437494f

99

#define W5_D 0.58778525229247f

100

101

/** Computes radix-5 butterfly unit.

102

*

103

* @param[in,out] c0 Complex input 0.

104

* @param[in,out] c1 Complex input 1.

105

* @param[in,out] c2 Complex input 2.

106

* @param[in,out] c3 Complex input 3.

107

* @param[in,out] c4 Complex input 4.

108

*/

109

#define DFT_5(c0, c1, c2, c3, c4) \

110

{ \

111

float2 v0, v1, v2, v3, v4; \

112

v0 = c0; \

113

v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \

114

v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \

115

v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \

116

v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \

117

c0 = v0 + c1 + c2 + c3 + c4; \

118

c1 = v0 + v1 + (float2)(v4.y, -v4.x); \

119

c2 = v0 - v2 + (float2)(v3.y, -v3.x); \

120

c3 = v0 - v2 + (float2)(-v3.y, v3.x); \

121

c4 = v0 + v1 + (float2)(-v4.y, v4.x); \

122

}

123

124

// radix-7 butterfly unit factors

125

#define W7_A 0.62348980185873f

126

#define W7_B 0.78183148246802f

127

#define W7_C 0.22252093395631f

128

#define W7_D 0.97492791218182f

129

#define W7_E 0.90096886790241f

130

#define W7_F 0.43388373911755f

131

132

/** Computes radix-7 butterfly unit.

133

*

134

* @param[in,out] c0 Complex input 0.

135

* @param[in,out] c1 Complex input 1.

136

* @param[in,out] c2 Complex input 2.

137

* @param[in,out] c3 Complex input 3.

138

* @param[in,out] c4 Complex input 4.

139

* @param[in,out] c5 Complex input 5.

140

* @param[in,out] c6 Complex input 6.

141

*/

142

#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \

143

{ \

144

float2 v0, v1, v2, v3, v4, v5, v6; \

145

v0 = c0; \

146

v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \

147

v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \

148

v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \

149

v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \

150

v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \

151

v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \

152

c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \

153

c1 = v0 + v1 + (float2)(v4.y, -v4.x); \

154

c2 = v0 - v2 + (float2)(v5.y, -v5.x); \

155

c3 = v0 - v3 + (float2)(v6.y, -v6.x); \

156

c4 = v0 - v3 + (float2)(-v6.y, v6.x); \

157

c5 = v0 - v2 + (float2)(-v5.y, v5.x); \

158

c6 = v0 + v1 + (float2)(-v4.y, v4.x); \

159

}

160

161

/** Computes radix-8 butterfly unit.

162

*

163

* @param[in,out] c0 Complex input 0.

164

* @param[in,out] c1 Complex input 1.

165

* @param[in,out] c2 Complex input 2.

166

* @param[in,out] c3 Complex input 3.

167

* @param[in,out] c4 Complex input 4.

168

* @param[in,out] c5 Complex input 5.

169

* @param[in,out] c6 Complex input 6.

170

* @param[in,out] c7 Complex input 7.

171

*/

172

#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \

173

{ \

174

float2 v0, v1, v2, v3, v4, v5, v6, v7; \

175

float2 s0, s1, s2, s3, s4, s5, s6, s7; \

float2 t0, t1, t2; \

v0 = c0 + c4; \

v1 = c1 + c5; \

v2 = c2 + c6; \

v3 = c3 + c7; \

v4 = c0 - c4; \

v5 = c1 - c5; \

v6 = c2 - c6; \

v7 = c3 - c7; \

s0 = v0 + v2; \

s1 = v1 + v3; \

s2 = v0 - v2; \

s3 = v1 - v3; \

s4.x = v4.x - v6.y; \

190

s4.y = v4.y + v6.x; \

191

s5.x = v5.x - v7.y; \

192

s5.y = v5.y + v7.x; \

193

s6.x = v4.x + v6.y; \

194

s6.y = v4.y - v6.x; \

195

s7.x = v5.x + v7.y; \

196

s7.y = v5.y - v7.x; \

197

t0.x = -s3.y; \

198

t0.y = s3.x; \

199

t1.x = M_SQRT1_2_F * (s5.x - s5.y); \

200

t1.y = M_SQRT1_2_F * (s5.x + s5.y); \

201

t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \

202

t2.y = M_SQRT1_2_F * (s7.x - s7.y); \

c0 = s0 + s1; \

c1 = s6 - t2; \

c2 = s2 - t0; \

c3 = s4 - t1; \

c4 = s0 - s1; \

c5 = s6 + t2; \

c6 = s2 + t0; \

c7 = s4 + t1; \

}

Georgios Pinitas

2019-03-26 17:23:28 +0000

[diff] [blame^]

213

/** Computes the first stage of a radix-2 DFT on axis 0.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

214

*

215

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

216

*

217

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

218

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

219

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

220

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

221

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

222

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

223

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

224

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

225

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

226

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

227

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

228

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

229

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

230

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

231

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

232

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

233

*/

234

kernel void fft_radix_2_first_stage_axis_0(

235

TENSOR3D_DECLARATION(input)

236

#ifndef IN_PLACE

237

,

238

TENSOR3D_DECLARATION(output)

239

#endif /* not IN_PLACE */

240

)

241

{

242

// Get tensor pointers

243

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

244

#ifdef IN_PLACE

245

Tensor3D output = input;

246

#else /* IN_PLACE */

247

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

248

#endif /* IN_PLACE */

249

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

250

// Load two complex input values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

251

float4 data = vload4(0, (__global float *)input.ptr);

252

253

// Compute DFT N = 2

254

DFT_2(data.s01, data.s23);

255

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

256

// Store two complex output values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

257

vstore4(data, 0, (__global float *)output.ptr);

258

}

259

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

260

/** Computes the first stage of a radix-2 DFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

261

*

262

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

263

*

264

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

265

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

266

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

267

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

268

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

269

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

270

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

271

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

272

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

273

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

274

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

275

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

276

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

277

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

278

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

279

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

280

*/

281

kernel void fft_radix_2_first_stage_axis_1(

282

TENSOR3D_DECLARATION(input)

283

#ifndef IN_PLACE

284

,

285

TENSOR3D_DECLARATION(output)

286

#endif /* not IN_PLACE */

287

)

288

{

289

// Get tensor pointers

290

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

291

#ifdef IN_PLACE

292

Tensor3D output = input;

293

#else /* IN_PLACE */

294

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

295

#endif /* IN_PLACE */

296

297

// Load two complex input values

298

float2 data1 = vload2(0, (__global float *)input.ptr);

299

float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));

// Compute DFT N = 2

DFT_2(data1, data2);

// Store two complex output values

305

vstore2(data1, 0, (__global float *)output.ptr);

306

vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));

307

}

308

309

/** Computes the first stage of a radix-3 DFT on axis 0.

310

*

311

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

312

*

313

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

314

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

315

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

316

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

317

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

318

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

319

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

320

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

321

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

322

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

323

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

324

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

325

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

326

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

327

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

328

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

329

*/

330

kernel void fft_radix_3_first_stage_axis_0(

331

TENSOR3D_DECLARATION(input)

332

#ifndef IN_PLACE

333

,

334

TENSOR3D_DECLARATION(output)

335

#endif /* not IN_PLACE */

336

)

337

{

338

// Get tensor pointers

339

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

340

#ifdef IN_PLACE

341

Tensor3D output = input;

342

#else /* IN_PLACE */

343

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

344

#endif /* IN_PLACE */

345

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

346

// Load three complex input values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

347

float4 data0 = vload4(0, (__global float *)input.ptr);

348

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 2, 0, 0));

349

350

// Compute DFT N = 3

351

DFT_3(data0.s01, data0.s23, data1.s01);

352

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

353

// Store three complex output values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

354

vstore4(data0, 0, (__global float *)output.ptr);

355

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 2, 0, 0));

356

}

357

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

358

/** Computes the first stage of a radix-3 DFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

359

*

360

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

361

*

362

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

363

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

364

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

365

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

366

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

367

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

368

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

369

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

370

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

371

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

372

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

373

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

374

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

375

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

376

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

377

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

378

*/

379

kernel void fft_radix_3_first_stage_axis_1(

380

TENSOR3D_DECLARATION(input)

381

#ifndef IN_PLACE

382

,

383

TENSOR3D_DECLARATION(output)

384

#endif /* not IN_PLACE */

385

)

386

{

387

// Get tensor pointers

388

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

389

#ifdef IN_PLACE

390

Tensor3D output = input;

391

#else /* IN_PLACE */

392

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

393

#endif /* IN_PLACE */

394

395

// Load three complex input values

396

float2 data0 = vload2(0, (__global float *)input.ptr);

397

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));

398

float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));

399

400

// Compute DFT N = 3

401

DFT_3(data0, data1, data2);

402

403

// Store three complex output values

404

vstore2(data0, 0, (__global float *)output.ptr);

405

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));

406

vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));

407

}

408

409

/** Computes the first stage of a radix-4 DFT on axis 0.

410

*

411

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

412

*

413

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

414

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

415

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

416

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

417

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

418

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

419

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

420

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

421

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

422

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

423

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

424

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

425

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

426

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

427

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

428

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

429

*/

430

kernel void fft_radix_4_first_stage_axis_0(

431

TENSOR3D_DECLARATION(input)

432

#ifndef IN_PLACE

433

,

434

TENSOR3D_DECLARATION(output)

435

#endif /* not IN_PLACE */

436

)

437

{

438

// Get tensor pointers

439

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

440

#ifdef IN_PLACE

441

Tensor3D output = input;

442

#else /* IN_PLACE */

443

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

444

#endif /* IN_PLACE */

445

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

446

// Load four complex input values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

447

float8 data = vload8(0, (__global float *)input.ptr);

448

449

// Compute DFT N = 4

450

DFT_4(data.s01, data.s23, data.s45, data.s67);

451

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

452

// Store four complex output values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

453

vstore8(data, 0, (__global float *)output.ptr);

454

}

455

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

456

/** Computes the first stage of a radix-4 DFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

457

*

458

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

459

*

460

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

461

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

462

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

463

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

464

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

465

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

466

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

467

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

468

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

469

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

470

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

471

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

472

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

473

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

474

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

475

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

476

*/

477

kernel void fft_radix_4_first_stage_axis_1(

478

TENSOR3D_DECLARATION(input)

479

#ifndef IN_PLACE

480

,

481

TENSOR3D_DECLARATION(output)

482

#endif /* not IN_PLACE */

483

)

484

{

485

// Get tensor pointers

486

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

487

#ifdef IN_PLACE

488

Tensor3D output = input;

489

#else /* IN_PLACE */

490

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

491

#endif /* IN_PLACE */

492

493

// Load four complex input values

494

float2 data0 = vload2(0, (__global float *)input.ptr);

495

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));

496

float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));

497

float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));

498

499

// Compute DFT N = 4

500

DFT_4(data0, data1, data2, data3);

501

502

// Store four complex output values

503

vstore2(data0, 0, (__global float *)output.ptr);

504

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));

505

vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));

506

vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));

507

}

508

509

/** Computes the first stage of a radix-5 DFT on axis 0.

510

*

511

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

512

*

513

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

514

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

515

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

516

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

517

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

518

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

519

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

520

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

521

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

522

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

523

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

524

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

525

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

526

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

527

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

528

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

529

*/

530

kernel void fft_radix_5_first_stage_axis_0(

531

TENSOR3D_DECLARATION(input)

532

#ifndef IN_PLACE

533

,

534

TENSOR3D_DECLARATION(output)

535

#endif /* not IN_PLACE */

536

)

537

{

538

// Get tensor pointers

539

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

540

#ifdef IN_PLACE

541

Tensor3D output = input;

542

#else /* IN_PLACE */

543

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

544

#endif /* IN_PLACE */

545

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

546

// Load five complex input values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

547

float8 data0 = vload8(0, (__global float *)input.ptr);

548

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));

549

550

// Compute DFT N = 5

551

DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);

552

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

553

// Store five complex output values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

554

vstore8(data0, 0, (__global float *)output.ptr);

555

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));

556

}

557

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

558

/** Computes the first stage of a radix-5 DFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

559

*

560

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

561

*

562

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

563

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

564

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

565

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

566

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

567

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

568

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

569

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

570

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

571

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

572

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

573

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

574

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

575

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

576

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

577

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

578

*/

579

kernel void fft_radix_5_first_stage_axis_1(

580

TENSOR3D_DECLARATION(input)

581

#ifndef IN_PLACE

582

,

583

TENSOR3D_DECLARATION(output)

584

#endif /* not IN_PLACE */

585

)

586

{

587

// Get tensor pointers

588

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

589

#ifdef IN_PLACE

590

Tensor3D output = input;

591

#else /* IN_PLACE */

592

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

593

#endif /* IN_PLACE */

594

595

// Load five complex input values

596

float2 data0 = vload2(0, (__global float *)input.ptr);

597

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));

598

float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));

599

float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));

600

float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));

601

602

// Compute DFT N = 5

603

DFT_5(data0, data1, data2, data3, data4);

604

605

// Store five complex output values

606

vstore2(data0, 0, (__global float *)output.ptr);

607

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));

608

vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));

609

vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));

610

vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));

611

}

612

613

/** Computes the first stage of a radix-7 DFT on axis 0.

614

*

615

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

616

*

617

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

618

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

619

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

620

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

621

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

622

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

623

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

624

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

625

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

626

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

627

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

628

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

629

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

630

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

631

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

632

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

633

*/

634

kernel void fft_radix_7_first_stage_axis_0(

635

TENSOR3D_DECLARATION(input)

636

#ifndef IN_PLACE

637

,

638

TENSOR3D_DECLARATION(output)

639

#endif /* not IN_PLACE */

640

)

641

{

642

// Get tensor pointers

643

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

644

#ifdef IN_PLACE

645

Tensor3D output = input;

646

#else /* IN_PLACE */

647

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

648

#endif /* IN_PLACE */

649

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

650

// Load seven complex input values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

651

float8 data0 = vload8(0, (__global float *)input.ptr);

652

float4 data1 = vload4(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));

653

float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 6, 0, 0));

654

655

// Compute DFT N = 7

656

DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);

657

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

658

// Store seven complex output values

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

659

vstore8(data0, 0, (__global float *)output.ptr);

660

vstore4(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));

661

vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 6, 0, 0));

662

}

663

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

664

/** Computes the first stage of a radix-7 DFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

665

*

666

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

667

*

668

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

669

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

670

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

671

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

672

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

673

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

674

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

675

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

676

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

677

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

678

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

679

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

680

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

681

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

682

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

683

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

684

*/

685

kernel void fft_radix_7_first_stage_axis_1(

686

TENSOR3D_DECLARATION(input)

687

#ifndef IN_PLACE

688

,

689

TENSOR3D_DECLARATION(output)

690

#endif /* not IN_PLACE */

691

)

692

{

693

// Get tensor pointers

694

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

695

#ifdef IN_PLACE

696

Tensor3D output = input;

697

#else /* IN_PLACE */

698

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

699

#endif /* IN_PLACE */

700

701

// Load seven complex input values

702

float2 data0 = vload2(0, (__global float *)input.ptr);

703

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));

704

float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));

705

float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));

706

float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));

707

float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));

708

float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));

709

710

// Compute DFT N = 7

711

DFT_7(data0, data1, data2, data3, data4, data5, data6);

712

713

// Store seven complex output values

714

vstore2(data0, 0, (__global float *)output.ptr);

715

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));

716

vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));

717

vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));

718

vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));

719

vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));

720

vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));

721

}

722

723

/** Computes the first stage of a radix-8 DFT on axis 0.

724

*

725

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

726

*

727

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

728

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

729

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

730

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

731

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

732

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

733

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

734

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

735

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

736

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

737

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

738

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

739

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

740

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

741

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

742

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

743

*/

744

kernel void fft_radix_8_first_stage_axis_0(

745

TENSOR3D_DECLARATION(input)

746

#ifndef IN_PLACE

747

,

748

TENSOR3D_DECLARATION(output)

749

#endif /* not IN_PLACE */

750

)

751

{

752

// Get tensor pointers

753

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

754

#ifdef IN_PLACE

755

Tensor3D output = input;

756

#else /* IN_PLACE */

757

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

758

#endif /* IN_PLACE */

759

760

// Load eight complex input values

761

float16 data = vload16(0, (__global float *)input.ptr);

762

763

// Compute DFT N = 8

764

DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);

765

766

// Store eight complex output values

767

vstore16(data, 0, (__global float *)output.ptr);

768

}

769

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

770

/** Computes the first stage of a radix-8 DFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

771

*

772

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

773

*

774

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

775

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

776

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

777

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

778

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

779

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

780

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

781

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

782

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

783

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

784

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

785

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

786

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

787

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

788

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

789

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

790

*/

791

kernel void fft_radix_8_first_stage_axis_1(

792

TENSOR3D_DECLARATION(input)

793

#ifndef IN_PLACE

794

,

795

TENSOR3D_DECLARATION(output)

796

#endif /* not IN_PLACE */

797

)

798

{

799

// Get tensor pointers

800

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

801

#ifdef IN_PLACE

802

Tensor3D output = input;

803

#else /* IN_PLACE */

804

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

805

#endif /* IN_PLACE */

806

807

// Load eight complex input values

808

float2 data0 = vload2(0, (__global float *)input.ptr);

809

float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));

810

float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));

811

float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));

812

float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));

813

float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));

814

float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));

815

float2 data7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7, 0));

816

817

// Compute DFT N = 8

818

DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);

819

820

// Store eight complex output values

821

vstore2(data0, 0, (__global float *)output.ptr);

822

vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));

823

vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));

824

vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));

825

vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));

826

vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));

827

vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));

828

vstore2(data7, 0, (__global float *)tensor3D_offset(&output, 0, 7, 0));

829

}

830

831

/** Computes a stage of a radix-2 FFT on axis 0.

832

*

833

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

834

*

835

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

836

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

837

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

838

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

839

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

840

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

841

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

842

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

843

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

844

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

845

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

846

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

847

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

848

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

849

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

850

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

851

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

852

* @param[in] Ni Nx * Ny.

853

* @param[in] exp_const Exponent constant

854

*/

855

kernel void fft_radix_2_axis_0(

856

TENSOR3D_DECLARATION(input)

857

#ifndef IN_PLACE

858

,

859

TENSOR3D_DECLARATION(output)

860

#endif /* not IN_PLACE */

861

,

862

uint Nx, uint Ni, float exp_const)

863

{

864

// Each work-item computes a single radix-2

865

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

872

873

// Get tensor pointers

874

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

875

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

876

#ifdef IN_PLACE

877

Tensor3D output = input;

878

#else /* IN_PLACE */

879

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

880

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

881

#endif /* IN_PLACE */

882

883

// Load two complex input values

884

float2 c0 = vload2(0, (__global float *)input.ptr);

885

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

886

887

// Compute phi

888

float phi = (float)nx * exp_const;

889

890

// Multiply by twiddle factor

891

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

// Compute DFT N = 2

DFT_2(c0, c1);

// Store two complex output values

897

vstore2(c0, 0, (__global float *)output.ptr);

898

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

899

}

900

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

901

/** Computes a stage of a radix-2 FFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

902

*

903

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

904

*

905

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

906

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

907

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

908

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

909

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

910

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

911

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

912

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

913

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

914

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

915

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

916

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

917

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

918

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

919

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

920

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

921

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

922

* @param[in] Ni Nx * Ny.

923

* @param[in] exp_const Exponent constant

924

*/

925

kernel void fft_radix_2_axis_1(

926

TENSOR3D_DECLARATION(input)

927

#ifndef IN_PLACE

928

,

929

TENSOR3D_DECLARATION(output)

930

#endif /* not IN_PLACE */

931

,

932

uint Nx, uint Ni, float exp_const)

933

{

934

// Each work-item computes a single radix-2

935

uint kx = get_global_id(1);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

942

943

// Get tensor pointers

944

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

945

input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;

946

#ifdef IN_PLACE

947

Tensor3D output = input;

948

#else /* IN_PLACE */

949

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

950

output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

951

#endif /* IN_PLACE */

952

953

// Load two complex input values

954

float2 c0 = vload2(0, (__global float *)input.ptr);

955

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));

956

957

// Compute phi

958

float phi = (float)nx * exp_const;

959

960

// Multiply by twiddle factor

961

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

// Compute DFT N = 2

DFT_2(c0, c1);

// Store two complex output values

967

vstore2(c0, 0, (__global float *)output.ptr);

968

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));

969

}

970

971

/** Computes a stage of a radix-3 FFT on axis 0.

972

*

973

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

974

*

975

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

976

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

977

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

978

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

979

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

980

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

981

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

982

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

983

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

984

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

985

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

986

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

987

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

988

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

989

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

990

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

991

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

992

* @param[in] Ni Nx * Ny.

993

* @param[in] exp_const Exponent constant

994

*/

995

kernel void fft_radix_3_axis_0(

996

TENSOR3D_DECLARATION(input)

997

#ifndef IN_PLACE

998

,

999

TENSOR3D_DECLARATION(output)

1000

#endif /* not IN_PLACE */

1001

,

1002

uint Nx, uint Ni, float exp_const)

1003

{

1004

// Each work-item computes a single radix-3

1005

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1012

1013

// Get tensor pointers

1014

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1015

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

1016

#ifdef IN_PLACE

1017

Tensor3D output = input;

1018

#else /* IN_PLACE */

1019

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1020

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

1021

#endif /* IN_PLACE */

1022

1023

// Load three complex input values

1024

float2 c0 = vload2(0, (__global float *)input.ptr);

1025

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

1026

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

1027

1028

// Compute phi

1029

float phi = (float)nx * exp_const;

1030

1031

// Multiply by twiddle factor

1032

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1033

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

// Compute DFT N = 3

DFT_3(c0, c1, c2);

// Store three complex output values

1039

vstore2(c0, 0, (__global float *)output.ptr);

1040

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

1041

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

1042

}

1043

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1044

/** Computes a stage of a radix-3 FFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1045

*

1046

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1047

*

1048

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1049

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1050

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1051

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1052

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1053

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1054

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1055

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1056

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1057

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1058

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1059

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1060

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1061

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1062

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1063

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

1064

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1065

* @param[in] Ni Nx * Ny.

1066

* @param[in] exp_const Exponent constant

1067

*/

1068

kernel void fft_radix_3_axis_1(

1069

TENSOR3D_DECLARATION(input)

1070

#ifndef IN_PLACE

1071

,

1072

TENSOR3D_DECLARATION(output)

1073

#endif /* not IN_PLACE */

1074

,

1075

uint Nx, uint Ni, float exp_const)

1076

{

1077

// Each work-item computes a single radix-3

1078

uint kx = get_global_id(1);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1085

1086

// Get tensor pointers

1087

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1088

input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;

1089

#ifdef IN_PLACE

1090

Tensor3D output = input;

1091

#else /* IN_PLACE */

1092

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1093

output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

1094

#endif /* IN_PLACE */

1095

1096

// Load three complex input values

1097

float2 c0 = vload2(0, (__global float *)input.ptr);

1098

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));

1099

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));

1100

1101

// Compute phi

1102

float phi = (float)nx * exp_const;

1103

1104

// Multiply by twiddle factor

1105

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1106

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

// Compute DFT N = 3

DFT_3(c0, c1, c2);

// Store three complex output values

1112

vstore2(c0, 0, (__global float *)output.ptr);

1113

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));

1114

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));

1115

}

1116

1117

/** Computes a stage of a radix-4 FFT on axis 0.

1118

*

1119

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1120

*

1121

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1122

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1123

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1124

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1125

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1126

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1127

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1128

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

1129

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1130

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1131

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1132

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1133

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1134

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1135

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1136

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1137

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1138

* @param[in] Ni Nx * Ny.

1139

* @param[in] exp_const Exponent constant

1140

*/

1141

kernel void fft_radix_4_axis_0(

1142

TENSOR3D_DECLARATION(input)

1143

#ifndef IN_PLACE

1144

,

1145

TENSOR3D_DECLARATION(output)

1146

#endif /* not IN_PLACE */

1147

,

1148

uint Nx, uint Ni, float exp_const)

1149

{

1150

// Each work-item computes a single radix-4

1151

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1158

1159

// Get tensor pointers

1160

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1161

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

1162

#ifdef IN_PLACE

1163

Tensor3D output = input;

1164

#else /* IN_PLACE */

1165

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1166

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

1167

#endif /* IN_PLACE */

1168

1169

// Load four complex input values

1170

float2 c0 = vload2(0, (__global float *)input.ptr);

1171

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

1172

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

1173

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));

1174

1175

// Compute phi

1176

float phi = (float)nx * exp_const;

1177

1178

// Multiply by twiddle factor

1179

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1180

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

1181

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

1182

1183

// Compute DFT N = 4

1184

DFT_4(c0, c1, c2, c3);

1185

1186

// Store four complex output values

1187

vstore2(c0, 0, (__global float *)output.ptr);

1188

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

1189

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

1190

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));

1191

}

1192

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1193

/** Computes a stage of a radix-4 FFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1194

*

1195

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1196

*

1197

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1198

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1199

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1200

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1201

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1202

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1203

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1204

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1205

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1206

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1207

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1208

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1209

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1210

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1211

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1212

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

1213

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1214

* @param[in] Ni Nx * Ny.

1215

* @param[in] exp_const Exponent constant

1216

*/

1217

kernel void fft_radix_4_axis_1(

1218

TENSOR3D_DECLARATION(input)

1219

#ifndef IN_PLACE

1220

,

1221

TENSOR3D_DECLARATION(output)

1222

#endif /* not IN_PLACE */

1223

,

1224

uint Nx, uint Ni, float exp_const)

1225

{

1226

// Each work-item computes a single radix-4

1227

uint kx = get_global_id(1);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1234

1235

// Get tensor pointers

1236

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1237

input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;

1238

#ifdef IN_PLACE

1239

Tensor3D output = input;

1240

#else /* IN_PLACE */

1241

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1242

output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

1243

#endif /* IN_PLACE */

1244

1245

// Load four complex input values

1246

float2 c0 = vload2(0, (__global float *)input.ptr);

1247

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));

1248

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));

1249

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));

1250

1251

// Compute phi

1252

float phi = (float)nx * exp_const;

1253

1254

// Multiply by twiddle factor

1255

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1256

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

1257

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

1258

1259

// Compute DFT N = 4

1260

DFT_4(c0, c1, c2, c3);

1261

1262

// Store four complex output values

1263

vstore2(c0, 0, (__global float *)output.ptr);

1264

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));

1265

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));

1266

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));

1267

}

1268

1269

/** Computes a stage of a radix-5 FFT on axis 0.

1270

*

1271

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1272

*

1273

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1274

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1275

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1276

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1277

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1278

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1279

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1280

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

1281

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1282

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1283

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1284

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1285

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1286

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1287

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1288

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1289

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1290

* @param[in] Ni Nx * Ny.

1291

* @param[in] exp_const Exponent constant

1292

*/

1293

kernel void fft_radix_5_axis_0(

1294

TENSOR3D_DECLARATION(input)

1295

#ifndef IN_PLACE

1296

,

1297

TENSOR3D_DECLARATION(output)

1298

#endif /* not IN_PLACE */

1299

,

1300

uint Nx, uint Ni, float exp_const)

1301

{

1302

// Each work-item computes a single radix-5

1303

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1310

1311

// Get tensor pointers

1312

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1313

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

1314

#ifdef IN_PLACE

1315

Tensor3D output = input;

1316

#else /* IN_PLACE */

1317

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1318

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

1319

#endif /* IN_PLACE */

1320

1321

// Load five complex input values

1322

float2 c0 = vload2(0, (__global float *)input.ptr);

1323

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

1324

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

1325

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));

1326

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));

1327

1328

// Compute phi

1329

float phi = (float)nx * exp_const;

1330

1331

// Multiply by twiddle factor

1332

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1333

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

1334

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

1335

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

1336

1337

// Compute DFT N = 5

1338

DFT_5(c0, c1, c2, c3, c4);

1339

1340

// Store five complex output values

1341

vstore2(c0, 0, (__global float *)output.ptr);

1342

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

1343

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

1344

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));

1345

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));

1346

}

1347

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1348

/** Computes a stage of a radix-5 FFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1349

*

1350

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1351

*

1352

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1353

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1354

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1355

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1356

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1357

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1358

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1359

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1360

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1361

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1362

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1363

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1364

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1365

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1366

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1367

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

1368

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1369

* @param[in] Ni Nx * Ny.

1370

* @param[in] exp_const Exponent constant

1371

*/

1372

kernel void fft_radix_5_axis_1(

1373

TENSOR3D_DECLARATION(input)

1374

#ifndef IN_PLACE

1375

,

1376

TENSOR3D_DECLARATION(output)

1377

#endif /* not IN_PLACE */

1378

,

1379

uint Nx, uint Ni, float exp_const)

1380

{

1381

// Each work-item computes a single radix-5

1382

uint kx = get_global_id(1);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1389

1390

// Get tensor pointers

1391

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1392

input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;

1393

#ifdef IN_PLACE

1394

Tensor3D output = input;

1395

#else /* IN_PLACE */

1396

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1397

output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

1398

#endif /* IN_PLACE */

1399

1400

// Load five complex input values

1401

float2 c0 = vload2(0, (__global float *)input.ptr);

1402

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));

1403

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));

1404

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));

1405

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));

1406

1407

// Compute phi

1408

float phi = (float)nx * exp_const;

1409

1410

// Multiply by twiddle factor

1411

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1412

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

1413

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

1414

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

1415

1416

// Compute DFT N = 5

1417

DFT_5(c0, c1, c2, c3, c4);

1418

1419

// Store five complex output values

1420

vstore2(c0, 0, (__global float *)output.ptr);

1421

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));

1422

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));

1423

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));

1424

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));

1425

}

1426

1427

/** Computes a stage of a radix-7 FFT on axis 0.

1428

*

1429

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1430

*

1431

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1432

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1433

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1434

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1435

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1436

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1437

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1438

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

1439

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1440

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1441

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1442

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1443

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1444

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1445

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1446

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1447

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1448

* @param[in] Ni Nx * Ny.

1449

* @param[in] exp_const Exponent constant

1450

*/

1451

kernel void fft_radix_7_axis_0(

1452

TENSOR3D_DECLARATION(input)

1453

#ifndef IN_PLACE

1454

,

1455

TENSOR3D_DECLARATION(output)

1456

#endif /* not IN_PLACE */

1457

,

1458

uint Nx, uint Ni, float exp_const)

1459

{

1460

// Each work-item computes a single radix-7

1461

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1468

1469

// Get tensor pointers

1470

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1471

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

1472

#ifdef IN_PLACE

1473

Tensor3D output = input;

1474

#else /* IN_PLACE */

1475

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1476

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

1477

#endif /* IN_PLACE */

1478

1479

// Load seven complex input values

1480

float2 c0 = vload2(0, (__global float *)input.ptr);

1481

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

1482

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

1483

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));

1484

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));

1485

float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));

1486

float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));

1487

1488

// Compute phi

1489

float phi = (float)nx * exp_const;

1490

1491

// Multiply by twiddle factor

1492

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1493

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

1494

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

1495

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

1496

TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);

1497

TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);

1498

1499

// Compute DFT N = 7

1500

DFT_7(c0, c1, c2, c3, c4, c5, c6);

1501

1502

// Store seven complex output values

1503

vstore2(c0, 0, (__global float *)output.ptr);

1504

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

1505

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

1506

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));

1507

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));

1508

vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));

1509

vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));

1510

}

1511

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1512

/** Computes a stage of a radix-7 FFT on axis 1.

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1513

*

1514

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1515

*

1516

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1517

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1518

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1519

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1520

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1521

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1522

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1523

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1524

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1525

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1526

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1527

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1528

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1529

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1530

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1531

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

1532

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1533

* @param[in] Ni Nx * Ny.

1534

* @param[in] exp_const Exponent constant

1535

*/

1536

kernel void fft_radix_7_axis_1(

1537

TENSOR3D_DECLARATION(input)

1538

#ifndef IN_PLACE

1539

,

1540

TENSOR3D_DECLARATION(output)

1541

#endif /* not IN_PLACE */

1542

,

1543

uint Nx, uint Ni, float exp_const)

1544

{

1545

// Each work-item computes a single radix-7

1546

uint kx = get_global_id(1);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1553

1554

// Get tensor pointers

1555

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1556

input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;

1557

#ifdef IN_PLACE

1558

Tensor3D output = input;

1559

#else /* IN_PLACE */

1560

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1561

output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

1562

#endif /* IN_PLACE */

1563

1564

// Load seven complex input values

1565

float2 c0 = vload2(0, (__global float *)input.ptr);

1566

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));

1567

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));

1568

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));

1569

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));

1570

float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));

1571

float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));

1572

1573

// Compute phi

1574

float phi = (float)nx * exp_const;

1575

1576

// Multiply by twiddle factor

1577

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1578

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

1579

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

1580

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

1581

TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);

1582

TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);

1583

1584

// Compute DFT N = 7

1585

DFT_7(c0, c1, c2, c3, c4, c5, c6);

1586

1587

// Store seven complex output values

1588

vstore2(c0, 0, (__global float *)output.ptr);

1589

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));

1590

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));

1591

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));

1592

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));

1593

vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));

1594

vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));

1595

}

1596

1597

/** Computes a stage of a radix-8 FFT on axis 0.

1598

*

1599

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1600

*

1601

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1602

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1603

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1604

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1605

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1606

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1607

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1608

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

1609

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1610

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1611

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1612

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1613

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1614

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1615

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1616

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1617

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1618

* @param[in] Ni Nx * Ny.

1619

* @param[in] exp_const Exponent constant

1620

*/

1621

kernel void fft_radix_8_axis_0(

1622

TENSOR3D_DECLARATION(input)

1623

#ifndef IN_PLACE

1624

,

1625

TENSOR3D_DECLARATION(output)

1626

#endif /* not IN_PLACE */

1627

,

1628

uint Nx, uint Ni, float exp_const)

1629

{

1630

// Each work-item computes a single radix-8

1631

uint kx = get_global_id(0);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1638

1639

// Get tensor pointers

1640

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1641

input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;

1642

#ifdef IN_PLACE

1643

Tensor3D output = input;

1644

#else /* IN_PLACE */

1645

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1646

output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

1647

#endif /* IN_PLACE */

1648

1649

// Load eight complex input values

1650

float2 c0 = vload2(0, (__global float *)input.ptr);

1651

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));

1652

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));

1653

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));

1654

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));

1655

float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));

1656

float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));

1657

float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 7 * Nx, 0, 0));

1658

1659

// Compute phi

1660

float phi = (float)nx * exp_const;

1661

1662

// Multiply by twiddle factor

1663

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1664

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

1665

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

1666

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

1667

TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);

1668

TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);

1669

TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);

1670

1671

// Compute DFT N = 8

1672

DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);

1673

1674

// Store eight complex output values

1675

vstore2(c0, 0, (__global float *)output.ptr);

1676

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));

1677

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));

1678

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));

1679

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));

1680

vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));

1681

vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));

1682

vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 7 * Nx, 0, 0));

Georgios Pinitas

8be9148

2019-03-26 17:23:28 +0000

[diff] [blame^]

1683

}

1684

1685

/** Computes a stage of a radix-8 FFT on axis 1.

1686

*

1687

* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time

1688

*

1689

* @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32

1690

* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)

1691

* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

1692

* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)

1693

* @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

1694

* @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)

1695

* @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

1696

* @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor

1697

* @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr

1698

* @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)

1699

* @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)

1700

* @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)

1701

* @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)

1702

* @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)

1703

* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)

1704

* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image

1705

* @param[in] Nx The butterfly span. Products of radix order of previous radix's stage

1706

* @param[in] Ni Nx * Ny.

1707

* @param[in] exp_const Exponent constant

1708

*/

1709

kernel void fft_radix_8_axis_1(

1710

TENSOR3D_DECLARATION(input)

1711

#ifndef IN_PLACE

1712

,

1713

TENSOR3D_DECLARATION(output)

1714

#endif /* not IN_PLACE */

1715

,

1716

uint Nx, uint Ni, float exp_const)

1717

{

1718

// Each work-item computes a single radix-8

1719

uint kx = get_global_id(1);

// Compute nx

uint nx = kx % Nx;

// Compute n index

uint n = nx + (kx / Nx) * Ni;

1726

1727

// Get tensor pointers

1728

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);

1729

input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;

1730

#ifdef IN_PLACE

1731

Tensor3D output = input;

1732

#else /* IN_PLACE */

1733

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

1734

output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

1735

#endif /* IN_PLACE */

1736

1737

// Load eight complex input values

1738

float2 c0 = vload2(0, (__global float *)input.ptr);

1739

float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));

1740

float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));

1741

float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));

1742

float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));

1743

float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));

1744

float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));

1745

float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7 * Nx, 0));

1746

1747

// Compute phi

1748

float phi = (float)nx * exp_const;

1749

1750

// Multiply by twiddle factor

1751

TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

1752

TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

1753

TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

1754

TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

1755

TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);

1756

TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);

1757

TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);

1758

1759

// Compute DFT N = 8

1760

DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);

1761

1762

// Store eight complex output values

1763

vstore2(c0, 0, (__global float *)output.ptr);

1764

vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));

1765

vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));

1766

vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));

1767

vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));

1768

vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));

1769

vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));

1770

vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 0, 7 * Nx, 0));

Georgios Pinitas

0bc7849

2019-03-18 20:07:37 +0000

[diff] [blame]

1771

}