Blame - src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl - ml/ComputeLibrary

2021-07-05 13:12:52 +0100

[diff] [blame]

31

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC

32

*

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

33

* @note must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

34

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

35

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

36

* @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

37

* @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

38

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

39

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

40

*

41

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

42

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

43

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

44

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

45

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

46

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

47

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

48

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

49

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

50

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

51

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

52

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

53

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

54

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

55

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

56

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

57

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

58

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

59

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

60

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

61

* @param[in] _ISRC_HEIGHT The source tensor's height

62

* @param[in] _IDST_WIDTH The destination tensor's width

63

* @param[in] _IDST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

64

*/

65

__kernel void winograd_output_transform_2x2_7x7_nhwc(

66

TENSOR4D(src, BUFFER),

67

TENSOR4D(dst, BUFFER),

68

#if defined(HAS_BIAS)

69

VECTOR_DECLARATION(bias),

70

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

71

int dst_size,

72

const int _ISRC_HEIGHT,

73

const int _IDST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

74

const int _IDST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

75

{

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

76

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

77

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

78

#if defined(IS_BATCHED)

79

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

80

#else // defined(IS_BATCHED)

81

const int bout = 0; // BATCH SIZE IDX

82

#endif // defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

83

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

84

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

85

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

86

87

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

88

TILE(DATA_TYPE, 8, N0, in);

89

TILE(DATA_TYPE, 2, N0, out);

90

TILE(uint, 8, 1, src_indirect_y);

91

92

// Calculate the indirect Y for the source tensor

93

LOOP_UNROLLING(int, i, 0, 1, 8,

94

{

95

src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;

96

src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);

97

})

98

99

// Initialize the input tile

100

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

// Load the values across the 8 channels to compose the 8x1 tile

106

T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

107

108

// Compute out0 and out01

109

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

110

out[1].v = -in[1].v + in[2].v - (DATA_TYPE)2.f * in[3].v + (DATA_TYPE)2.0f * in[4].v - (DATA_TYPE)3.0f * in[5].v + (DATA_TYPE)3.0f * in[6].v + in[7].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

111

112

#if defined(HAS_BIAS)

113

// Add bias

114

TILE(DATA_TYPE, 1, N0, b);

115

116

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

117

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

118

T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

119

#endif // defined(HAS_BIAS)

120

121

T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

122

123

TILE(uint, 2, 1, dst_indirect_y);

124

125

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

126

LOOP_UNROLLING(int, yk, 0, 1, 2,

127

{

128

int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));

129

dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);

130

})

131

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

132

LOOP_UNROLLING(int, xk, 0, 1, 2,

133

{

134

int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));

135

dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);

136

})

137

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

138

139

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

140

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

141

142

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

143

144

TILE(DATA_TYPE, 64, N0, in);

145

TILE(DATA_TYPE, 4, N0, out);

146

TILE(DATA_TYPE, 16, N0, tmp);

147

TILE(uint, 64, 1, src_indirect_y);

148

149

// Calculate the indirect Y for the source tensor

150

LOOP_UNROLLING(int, i, 0, 1, 64,

151

{

152

src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;

153

src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);

154

})

155

156

// Initialize the input tile

157

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// Load the values across the 64 channels to compose the 8x8 tile

163

T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

164

165

LOOP_UNROLLING(int, i, 0, 1, 8,

166

{

167

tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

168

tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - (DATA_TYPE)2 * in[24 + i].v + (DATA_TYPE)2 * in[32 + i].v + (DATA_TYPE) - 3 * in[40 + i].v + (DATA_TYPE)3 * in[48 + i].v + in[56 + i].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

169

})

170

171

// Compute the 2x2 output tile

172

LOOP_UNROLLING(int, i, 0, 1, 2,

173

{

174

out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

175

out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - (DATA_TYPE)2 * tmp[6 + i].v + (DATA_TYPE)2 * tmp[8 + i].v - (DATA_TYPE)3 * tmp[10 + i].v + (DATA_TYPE)3 * tmp[12 + i].v + tmp[14 + i].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

176

})

177

178

#if defined(HAS_BIAS)

179

// Add bias

180

TILE(DATA_TYPE, 1, N0, b);

181

182

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

183

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

184

T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

185

#endif // defined(HAS_BIAS)

186

187

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

188

189

TILE(uint, 4, 1, dst_indirect_y);

190

191

// Calculate the destination indirect Y

192

LOOP_UNROLLING(int, yk, 0, 1, 2,

193

{

194

LOOP_UNROLLING(int, xk, 0, 1, 2,

195

{

196

int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));

197

int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));

198

dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;

199

dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

204

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

205

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

206

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

207

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

208

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

209

210

#if defined(VEC_SIZE) && VEC_SIZE == 4

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

211

#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

212

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

213

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

214

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

215

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

216

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

217

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

218

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

219

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

220

*

221

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

222

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

223

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

224

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

225

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

226

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

227

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

228

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

229

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

230

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

231

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

232

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

233

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

234

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

235

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

236

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

237

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

238

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

239

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

240

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

241

* @param[in] dst_size Size of the destination tensor, minus the last padding

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

242

* @param[in] SRC_HEIGHT The source tensor's height

243

* @param[in] DST_WIDTH The destination tensor's width

244

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

245

*/

246

__kernel void winograd_output_transform_4x4_3x3_nhwc(

247

TENSOR4D(src, BUFFER),

248

TENSOR4D(dst, BUFFER),

249

#if defined(HAS_BIAS)

250

VECTOR_DECLARATION(bias),

251

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

252

int dst_size,

253

const int SRC_HEIGHT,

254

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

255

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

256

{

257

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

258

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

259

#if defined(IS_BATCHED)

260

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

261

#else // defined(IS_BATCHED)

262

const int bout = 0; // BATCH SIZE IDX

263

#endif // defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

264

265

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

266

267

TILE(DATA_TYPE, 6, N0, in);

268

TILE(DATA_TYPE, 4, N0, out);

269

TILE(uint, 6, 1, src_indirect_y);

270

271

LOOP_UNROLLING(int, i, 0, 1, 6,

272

{

273

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

274

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);

275

})

276

277

// Initialize the input tile

278

LOOP_UNROLLING(int, i, 0, 1, 6,

{

in[i].v = 0;

})

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

284

T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

285

286

// Compute out00, out01, out02 and out03

287

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

288

out[1].v = in[1].v - in[2].v + (DATA_TYPE)2.0f * in[3].v - (DATA_TYPE)2.0f * in[4].v;

289

out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * in[3].v + (DATA_TYPE)4.0f * in[4].v;

290

out[3].v = in[1].v - in[2].v + (DATA_TYPE)8.0f * in[3].v - (DATA_TYPE)8.0f * in[4].v + in[5].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

291

292

#if defined(HAS_BIAS)

293

TILE(DATA_TYPE, 1, N0, b);

294

295

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

296

297

// c = c + bias[broadcasted]

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

298

T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

299

#endif // HAS_BIAS

300

301

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

302

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

303

304

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

305

306

TILE(uint, 4, 1, dst_indirect_y);

307

308

// Calculate the destination indirect Y

309

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

310

LOOP_UNROLLING(int, yk, 0, 1, 4,

311

{

312

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

313

dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;

314

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

315

})

316

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

317

LOOP_UNROLLING(int, xk, 0, 1, 4,

318

{

319

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

320

dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;

321

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

322

})

323

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

324

325

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

326

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

327

328

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

329

330

// Calculate the indirect Y for the source tensor

331

TILE(DATA_TYPE, 36, N0, in);

332

TILE(DATA_TYPE, 4, N0, tmp);

333

TILE(uint, 36, 1, src_indirect_y);

334

335

LOOP_UNROLLING(int, i, 0, 1, 36,

336

{

337

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

338

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);

339

})

340

341

// Initialize the input tile

342

LOOP_UNROLLING(int, i, 0, 1, 36,

{

in[i].v = 0;

})

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

348

T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

349

350

LOOP_UNROLLING(int, i, 0, 1, 6,

351

{

352

tmp[0].v = in[6 + i].v + in[12 + i].v;

353

tmp[1].v = in[6 + i].v - in[12 + i].v;

354

tmp[2].v = in[18 + i].v + in[24 + i].v;

355

tmp[3].v = in[18 + i].v - in[24 + i].v;

356

tmp[3].v = tmp[3].v + tmp[3].v;

357

in[i].v = in[i].v + tmp[0].v + tmp[2].v;

358

in[6 + i].v = tmp[3].v + tmp[1].v;

359

in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

360

in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;

361

})

362

363

// Compute the output tile

364

TILE(DATA_TYPE, 16, N0, out);

365

366

LOOP_UNROLLING(int, i, 0, 1, 4,

367

{

368

tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;

369

tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;

370

tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;

371

tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;

372

tmp[3].v = tmp[3].v + tmp[3].v;

373

out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;

374

out[4 * i + 1].v = tmp[3].v + tmp[1].v;

375

out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

376

out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;

377

})

378

379

#if defined(HAS_BIAS)

380

TILE(DATA_TYPE, 1, N0, b);

381

382

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

383

384

// c = c + bias[broadcasted]

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

385

T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

386

#endif // HAS_BIAS

387

388

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

389

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

390

391

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

392

393

TILE(uint, 16, 1, dst_indirect_y);

394

395

// Calculate the destination indirect Y

396

LOOP_UNROLLING(int, yk, 0, 1, 4,

397

{

398

LOOP_UNROLLING(int, xk, 0, 1, 4,

399

{

400

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

401

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

402

dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;

403

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

408

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

409

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

410

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

411

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

412

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

413

#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

414

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

415

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

416

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

417

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

418

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

419

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

420

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

421

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

422

*

423

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

424

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

425

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

426

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

427

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

428

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

429

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

430

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

431

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

432

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

433

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

434

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

435

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

436

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

437

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

438

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

439

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

440

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

441

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

442

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

443

* @param[in] SRC_HEIGHT The source tensor's height

444

* @param[in] DST_WIDTH The destination tensor's width

445

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

446

*/

447

__kernel void winograd_output_transform_4x4_5x5_nhwc(

448

TENSOR4D(src, BUFFER),

449

TENSOR4D(dst, BUFFER),

450

#if defined(HAS_BIAS)

451

VECTOR_DECLARATION(bias),

452

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

453

int dst_size,

454

const int SRC_HEIGHT,

455

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

456

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

457

{

458

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

459

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

460

#if defined(IS_BATCHED)

461

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

462

#else // defined(IS_BATCHED)

463

const int bout = 0; // BATCH SIZE IDX

464

#endif // defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

465

466

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

467

TILE(DATA_TYPE, 8, N0, in);

468

TILE(DATA_TYPE, 4, N0, out);

469

TILE(DATA_TYPE, 4, N0, tmp);

470

TILE(uint, 8, 1, src_indirect_y);

471

472

LOOP_UNROLLING(int, i, 0, 1, 8,

473

{

474

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

475

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);

476

})

477

478

// Initialize the input tile

479

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

// "in" contains 1x8 or 8x1 tile here

485

T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

486

487

// A^T * in, and in this degenerate case out consists of 1 column/row

488

tmp[0].v = in[1].v - in[2].v;

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

489

tmp[1].v = (DATA_TYPE)2.0f * (in[3].v - in[4].v);

490

tmp[2].v = (DATA_TYPE)2.0f * (in[5].v + in[6].v);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

491

tmp[3].v = in[3].v + in[4].v;

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

492

out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + (DATA_TYPE)4.0f * tmp[2].v;

493

out[1].v = tmp[0].v + tmp[1].v + (DATA_TYPE)4.0f * (in[5].v - in[6].v);

494

out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * tmp[3].v + tmp[2].v;

495

out[3].v = tmp[0].v + (DATA_TYPE)4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

496

497

#if defined(HAS_BIAS)

498

TILE(DATA_TYPE, 1, N0, b);

499

500

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

501

502

// c = c + bias[broadcasted]

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

503

T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

504

#endif // HAS_BIAS

505

506

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

507

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

508

509

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

510

511

TILE(uint, 4, 1, dst_indirect_y);

512

513

// Calculate the destination indirect Y

514

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

515

LOOP_UNROLLING(int, yk, 0, 1, 4,

516

{

517

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

518

dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;

519

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

520

})

521

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

522

LOOP_UNROLLING(int, xk, 0, 1, 4,

523

{

524

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

525

dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;

526

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

527

})

528

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

529

530

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

531

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

532

533

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

534

// Calculate the indirect Y for the source tensor

535

TILE(DATA_TYPE, 64, N0, in);

536

TILE(DATA_TYPE, 6, N0, tmp);

537

TILE(uint, 64, 1, src_indirect_y);

538

539

LOOP_UNROLLING(int, i, 0, 1, 64,

540

{

541

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

542

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);

543

})

544

545

// Initialize the input tile

546

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// "in" here is 8x8 tile

552

T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

553

554

// A^T * in

555

LOOP_UNROLLING(int, i, 0, 1, 8,

556

{

557

tmp[0].v = in[8 + i].v + in[16 + i].v;

558

tmp[1].v = in[8 + i].v - in[16 + i].v;

559

tmp[2].v = in[24 + i].v + in[32 + i].v;

560

tmp[3].v = in[24 + i].v - in[32 + i].v;

561

tmp[3].v = tmp[3].v + tmp[3].v;

562

tmp[4].v = in[40 + i].v + in[48 + i].v;

563

tmp[4].v = tmp[4].v + tmp[4].v;

564

tmp[5].v = in[40 + i].v - in[48 + i].v;

565

566

// 4x8 matrix as a result

567

in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);

568

in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);

569

in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);

570

in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;

571

})

572

573

// Compute the output tile

574

TILE(DATA_TYPE, 16, N0, out);

575

576

// in * A, with in = A^T * in as above

577

LOOP_UNROLLING(int, i, 0, 1, 4,

578

{

579

tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;

580

tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;

581

tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;

582

tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;

583

tmp[3].v = tmp[3].v + tmp[3].v;

584

tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;

585

tmp[4].v = tmp[4].v + tmp[4].v;

586

tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;

587

588

// 4x4 tile

589

out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);

590

out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);

591

out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;

592

out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;

593

})

594

595

#if defined(HAS_BIAS)

596

TILE(DATA_TYPE, 1, N0, b);

597

598

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

599

600

// c = c + bias[broadcasted]

Michalis Spyrou

2022-06-15 19:02:28 +0100

[diff] [blame]

601

T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

602

#endif // HAS_BIAS

603

604

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

605

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

606

607

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

608

609

TILE(uint, 16, 1, dst_indirect_y);

610

611

// Calculate the destination indirect Y

612

LOOP_UNROLLING(int, yk, 0, 1, 4,

613

{

614

LOOP_UNROLLING(int, xk, 0, 1, 4,

615

{

616

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

617

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

618

dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;

619

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

624

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

625

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

626

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

627

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

628

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

629

630

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

631

#if defined(VEC_SIZE) && VEC_SIZE == 2

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

632

#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

633

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC

634

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

635

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

636

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

637

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

638

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

639

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

640

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

641

*

642

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

643

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

644

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

645

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

646

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

647

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

648

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

649

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

650

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

651

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

652

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

653

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

654

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

655

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

656

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

657

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

658

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

659

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

660

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

661

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

662

* @param[in] SRC_HEIGHT The source tensor's height

663

* @param[in] DST_WIDTH The destination tensor's width

664

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

665

*/

666

__kernel void winograd_output_transform_2x1_7x1_nhwc(

667

TENSOR4D_DECLARATION(src),

668

TENSOR4D_DECLARATION(dst),

669

#if defined(HAS_BIAS)

670

VECTOR_DECLARATION(bias),

671

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

672

int dst_size,

673

const int SRC_HEIGHT,

674

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

675

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

676

{

677

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

697

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

702

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

703

dst_size,

704

SRC_HEIGHT,

705

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

706

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

707

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

708

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

709

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

710

711

#if defined(VEC_SIZE) && VEC_SIZE == 4

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

712

#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

713

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

714

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

715

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

716

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

717

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

718

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

719

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

720

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

721

*

722

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

723

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

724

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

725

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

726

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

727

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

728

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

729

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

730

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

731

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

732

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

733

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

734

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

735

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

736

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

737

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

738

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

739

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

740

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

741

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

742

* @param[in] SRC_HEIGHT The source tensor's height

743

* @param[in] DST_WIDTH The destination tensor's width

744

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

745

*/

746

__kernel void winograd_output_transform_4x1_3x1_nhwc(

747

TENSOR4D_DECLARATION(src),

748

TENSOR4D_DECLARATION(dst),

749

#if defined(HAS_BIAS)

750

VECTOR_DECLARATION(bias),

751

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

752

int dst_size,

753

const int SRC_HEIGHT,

754

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

755

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

756

{

757

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

777

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

782

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

783

dst_size,

784

SRC_HEIGHT,

785

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

786

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

787

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

788

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

789

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

790

#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

791

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

792

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

793

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

794

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

795

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

796

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

797

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

798

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

799

*

800

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

801

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

802

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

803

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

804

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

805

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

806

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

807

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

808

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

809

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

810

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

811

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

812

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

813

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

814

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

815

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

816

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

817

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

818

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

819

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

820

* @param[in] SRC_HEIGHT The source tensor's height

821

* @param[in] DST_WIDTH The destination tensor's width

822

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

823

*/

824

__kernel void winograd_output_transform_4x1_5x1_nhwc(

825

TENSOR4D_DECLARATION(src),

826

TENSOR4D_DECLARATION(dst),

827

#if defined(HAS_BIAS)

828

VECTOR_DECLARATION(bias),

829

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

830

int dst_size,

831

const int SRC_HEIGHT,

832

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

833

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

834

{

835

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

855

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

860

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

861

dst_size,

862

SRC_HEIGHT,

863

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

864

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

865

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

866

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

867

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

868

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

869

870

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

871

#if defined(VEC_SIZE) && VEC_SIZE == 2

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

872

#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

873

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC

874

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

875

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

876

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

877

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

878

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

879

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

880

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

881

*

882

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

883

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

884

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

885

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

886

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

887

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

888

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

889

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

890

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

891

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

892

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

893

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

894

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

895

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

896

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

897

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

898

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

899

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

900

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

901

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

902

* @param[in] SRC_HEIGHT The source tensor's height

903

* @param[in] DST_WIDTH The destination tensor's width

904

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

905

*/

906

__kernel void winograd_output_transform_1x2_1x7_nhwc(

907

TENSOR4D_DECLARATION(src),

908

TENSOR4D_DECLARATION(dst),

909

#if defined(HAS_BIAS)

910

VECTOR_DECLARATION(bias),

911

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

912

int dst_size,

913

const int SRC_HEIGHT,

914

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

915

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

916

{

917

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

937

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

942

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

943

dst_size,

944

SRC_HEIGHT,

945

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

946

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

947

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

948

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

949

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

950

951

#if defined(VEC_SIZE) && VEC_SIZE == 4

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

952

#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

953

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

954

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

955

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

956

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

957

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

958

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

959

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

960

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

961

*

962

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

963

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

964

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

965

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

966

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

967

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

968

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

969

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

970

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

971

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

972

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

973

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

974

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

975

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

976

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

977

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

978

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

979

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

980

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

981

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

982

* @param[in] SRC_HEIGHT The source tensor's height

983

* @param[in] DST_WIDTH The destination tensor's width

984

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

985

*/

986

__kernel void winograd_output_transform_1x4_1x3_nhwc(

987

TENSOR4D_DECLARATION(src),

988

TENSOR4D_DECLARATION(dst),

989

#if defined(HAS_BIAS)

990

VECTOR_DECLARATION(bias),

991

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

992

int dst_size,

993

const int SRC_HEIGHT,

994

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

995

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

996

{

997

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1017

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1022

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1023

dst_size,

1024

SRC_HEIGHT,

1025

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

1026

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1027

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1028

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1029

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1030

#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1031

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

1032

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1033

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1034

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1035

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1036

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

1037

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1038

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1039

*

1040

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1041

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1042

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1043

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1044

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1045

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1046

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1047

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1048

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1049

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1050

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1051

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1052

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1053

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1054

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1055

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1056

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1057

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1058

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1059

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1060

* @param[in] SRC_HEIGHT The source tensor's height

1061

* @param[in] DST_WIDTH The destination tensor's width

1062

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1063

*/

1064

__kernel void winograd_output_transform_1x4_1x5_nhwc(

1065

TENSOR4D_DECLARATION(src),

1066

TENSOR4D_DECLARATION(dst),

1067

#if defined(HAS_BIAS)

1068

VECTOR_DECLARATION(bias),

1069

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1070

int dst_size,

1071

const int SRC_HEIGHT,

1072

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

1073

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1074

{

1075

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1095

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1100

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1101

dst_size,

1102

SRC_HEIGHT,

1103

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

1104

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1105

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1106

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1107

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

1108

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice