Blame - src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl - ml/ComputeLibrary

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC

31

*

32

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

33

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

34

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

35

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32

36

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

37

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

38

* @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

39

* @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

40

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

41

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

42

*

43

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

44

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

45

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

46

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

47

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

48

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

49

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

50

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

51

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

52

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

53

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

54

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

55

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

56

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

57

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

58

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

59

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

60

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

61

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

62

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

63

*/

64

__kernel void winograd_output_transform_2x2_7x7_nhwc(

65

TENSOR4D(src, BUFFER),

66

TENSOR4D(dst, BUFFER),

67

#if defined(HAS_BIAS)

68

VECTOR_DECLARATION(bias),

69

#endif // defined(HAS_BIAS)

70

int dst_size)

71

{

72

#define _ISRC_HEIGHT SRC_HEIGHT

73

#define _IDST_WIDTH DST_WIDTH

74

#define _IDST_HEIGHT DST_HEIGHT

75

#define _INUM_TILES_X NUM_TILES_X

76

77

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

78

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

79

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

80

81

int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

82

int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

83

84

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

85

TILE(DATA_TYPE, 8, N0, in);

86

TILE(DATA_TYPE, 2, N0, out);

87

TILE(uint, 8, 1, src_indirect_y);

88

89

// Calculate the indirect Y for the source tensor

90

LOOP_UNROLLING(int, i, 0, 1, 8,

91

{

92

src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;

93

src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);

94

})

95

96

// Initialize the input tile

97

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

// Load the values across the 8 channels to compose the 8x1 tile

103

T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

104

105

// Compute out0 and out01

106

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;

107

out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;

108

109

#if defined(HAS_BIAS)

110

// Add bias

111

TILE(DATA_TYPE, 1, N0, b);

112

113

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

114

115

T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);

116

#endif // defined(HAS_BIAS)

117

118

T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

119

120

TILE(uint, 2, 1, dst_indirect_y);

121

122

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

123

LOOP_UNROLLING(int, yk, 0, 1, 2,

124

{

125

int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));

126

dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);

127

})

128

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

129

LOOP_UNROLLING(int, xk, 0, 1, 2,

130

{

131

int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));

132

dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);

133

})

134

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

135

136

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

137

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

138

139

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

140

141

TILE(DATA_TYPE, 64, N0, in);

142

TILE(DATA_TYPE, 4, N0, out);

143

TILE(DATA_TYPE, 16, N0, tmp);

144

TILE(uint, 64, 1, src_indirect_y);

145

146

// Calculate the indirect Y for the source tensor

147

LOOP_UNROLLING(int, i, 0, 1, 64,

148

{

149

src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;

150

src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);

151

})

152

153

// Initialize the input tile

154

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// Load the values across the 64 channels to compose the 8x8 tile

160

T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

161

162

LOOP_UNROLLING(int, i, 0, 1, 8,

163

{

164

tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;

165

tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;

166

})

167

168

// Compute the 2x2 output tile

169

LOOP_UNROLLING(int, i, 0, 1, 2,

170

{

171

out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;

172

out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;

173

})

174

175

#if defined(HAS_BIAS)

176

// Add bias

177

TILE(DATA_TYPE, 1, N0, b);

178

179

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

180

181

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

182

#endif // defined(HAS_BIAS)

183

184

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

185

186

TILE(uint, 4, 1, dst_indirect_y);

187

188

// Calculate the destination indirect Y

189

LOOP_UNROLLING(int, yk, 0, 1, 2,

190

{

191

LOOP_UNROLLING(int, xk, 0, 1, 2,

192

{

193

int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));

194

int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));

195

dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;

196

dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

201

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

202

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

203

}

204

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

205

206

#if defined(VEC_SIZE) && VEC_SIZE == 4

207

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

208

*

209

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

210

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

211

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

212

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32

213

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

214

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

215

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

216

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

217

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

218

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

219

*

220

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

221

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

222

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

223

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

224

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

225

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

226

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

227

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

228

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

229

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

230

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

231

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

232

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

233

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

234

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

235

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

236

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

237

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

238

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

239

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

240

* @param[in] dst_size Size of the destination tensor, minus the last padding

241

*/

242

__kernel void winograd_output_transform_4x4_3x3_nhwc(

243

TENSOR4D(src, BUFFER),

244

TENSOR4D(dst, BUFFER),

245

#if defined(HAS_BIAS)

246

VECTOR_DECLARATION(bias),

247

#endif // defined(HAS_BIAS)

248

int dst_size)

249

{

250

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

251

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

252

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

253

254

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

255

256

TILE(DATA_TYPE, 6, N0, in);

257

TILE(DATA_TYPE, 4, N0, out);

258

TILE(uint, 6, 1, src_indirect_y);

259

260

LOOP_UNROLLING(int, i, 0, 1, 6,

261

{

262

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

263

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);

264

})

265

266

// Initialize the input tile

267

LOOP_UNROLLING(int, i, 0, 1, 6,

{

in[i].v = 0;

})

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

273

T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

274

275

// Compute out00, out01, out02 and out03

276

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;

277

out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;

278

out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;

279

out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;

280

281

#if defined(HAS_BIAS)

282

TILE(DATA_TYPE, 1, N0, b);

283

284

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

285

286

// c = c + bias[broadcasted]

287

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

288

#endif // HAS_BIAS

289

290

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

291

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

292

293

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

294

295

TILE(uint, 4, 1, dst_indirect_y);

296

297

// Calculate the destination indirect Y

298

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

299

LOOP_UNROLLING(int, yk, 0, 1, 4,

300

{

301

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

302

dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;

303

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

304

})

305

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

306

LOOP_UNROLLING(int, xk, 0, 1, 4,

307

{

308

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

309

dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;

310

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

311

})

312

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

313

314

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

315

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

316

317

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

318

319

// Calculate the indirect Y for the source tensor

320

TILE(DATA_TYPE, 36, N0, in);

321

TILE(DATA_TYPE, 4, N0, tmp);

322

TILE(uint, 36, 1, src_indirect_y);

323

324

LOOP_UNROLLING(int, i, 0, 1, 36,

325

{

326

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

327

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);

328

})

329

330

// Initialize the input tile

331

LOOP_UNROLLING(int, i, 0, 1, 36,

{

in[i].v = 0;

})

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

337

T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

338

339

LOOP_UNROLLING(int, i, 0, 1, 6,

340

{

341

tmp[0].v = in[6 + i].v + in[12 + i].v;

342

tmp[1].v = in[6 + i].v - in[12 + i].v;

343

tmp[2].v = in[18 + i].v + in[24 + i].v;

344

tmp[3].v = in[18 + i].v - in[24 + i].v;

345

tmp[3].v = tmp[3].v + tmp[3].v;

346

in[i].v = in[i].v + tmp[0].v + tmp[2].v;

347

in[6 + i].v = tmp[3].v + tmp[1].v;

348

in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

349

in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;

350

})

351

352

// Compute the output tile

353

TILE(DATA_TYPE, 16, N0, out);

354

355

LOOP_UNROLLING(int, i, 0, 1, 4,

356

{

357

tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;

358

tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;

359

tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;

360

tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;

361

tmp[3].v = tmp[3].v + tmp[3].v;

362

out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;

363

out[4 * i + 1].v = tmp[3].v + tmp[1].v;

364

out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

365

out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;

366

})

367

368

#if defined(HAS_BIAS)

369

TILE(DATA_TYPE, 1, N0, b);

370

371

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

372

373

// c = c + bias[broadcasted]

374

T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);

375

#endif // HAS_BIAS

376

377

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

378

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

379

380

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

381

382

TILE(uint, 16, 1, dst_indirect_y);

383

384

// Calculate the destination indirect Y

385

LOOP_UNROLLING(int, yk, 0, 1, 4,

386

{

387

LOOP_UNROLLING(int, xk, 0, 1, 4,

388

{

389

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

390

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

391

dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;

392

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

397

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

398

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

399

}

400

401

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

402

*

403

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

404

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

405

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

406

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32

407

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

408

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

409

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

410

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

411

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

412

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

413

*

414

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

415

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

416

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

417

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

418

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

419

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

420

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

421

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

422

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

423

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

424

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

425

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

426

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

427

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

428

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

429

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

430

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

431

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

432

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

433

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

434

*/

435

__kernel void winograd_output_transform_4x4_5x5_nhwc(

436

TENSOR4D(src, BUFFER),

437

TENSOR4D(dst, BUFFER),

438

#if defined(HAS_BIAS)

439

VECTOR_DECLARATION(bias),

440

#endif // defined(HAS_BIAS)

441

int dst_size)

442

{

443

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

444

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

445

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

446

447

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

448

TILE(DATA_TYPE, 8, N0, in);

449

TILE(DATA_TYPE, 4, N0, out);

450

TILE(DATA_TYPE, 4, N0, tmp);

451

TILE(uint, 8, 1, src_indirect_y);

452

453

LOOP_UNROLLING(int, i, 0, 1, 8,

454

{

455

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

456

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);

457

})

458

459

// Initialize the input tile

460

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

// "in" contains 1x8 or 8x1 tile here

466

T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

467

468

// A^T * in, and in this degenerate case out consists of 1 column/row

469

tmp[0].v = in[1].v - in[2].v;

470

tmp[1].v = 2.0f * (in[3].v - in[4].v);

471

tmp[2].v = 2.0f * (in[5].v + in[6].v);

472

tmp[3].v = in[3].v + in[4].v;

473

out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;

474

out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);

475

out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;

476

out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;

477

478

#if defined(HAS_BIAS)

479

TILE(DATA_TYPE, 1, N0, b);

480

481

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

482

483

// c = c + bias[broadcasted]

484

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

485

#endif // HAS_BIAS

486

487

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

488

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

489

490

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

491

492

TILE(uint, 4, 1, dst_indirect_y);

493

494

// Calculate the destination indirect Y

495

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

496

LOOP_UNROLLING(int, yk, 0, 1, 4,

497

{

498

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

499

dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;

500

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

501

})

502

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

503

LOOP_UNROLLING(int, xk, 0, 1, 4,

504

{

505

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

506

dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;

507

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

508

})

509

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

510

511

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

512

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

513

514

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

515

// Calculate the indirect Y for the source tensor

516

TILE(DATA_TYPE, 64, N0, in);

517

TILE(DATA_TYPE, 6, N0, tmp);

518

TILE(uint, 64, 1, src_indirect_y);

519

520

LOOP_UNROLLING(int, i, 0, 1, 64,

521

{

522

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

523

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);

524

})

525

526

// Initialize the input tile

527

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// "in" here is 8x8 tile

533

T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

534

535

// A^T * in

536

LOOP_UNROLLING(int, i, 0, 1, 8,

537

{

538

tmp[0].v = in[8 + i].v + in[16 + i].v;

539

tmp[1].v = in[8 + i].v - in[16 + i].v;

540

tmp[2].v = in[24 + i].v + in[32 + i].v;

541

tmp[3].v = in[24 + i].v - in[32 + i].v;

542

tmp[3].v = tmp[3].v + tmp[3].v;

543

tmp[4].v = in[40 + i].v + in[48 + i].v;

544

tmp[4].v = tmp[4].v + tmp[4].v;

545

tmp[5].v = in[40 + i].v - in[48 + i].v;

546

547

// 4x8 matrix as a result

548

in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);

549

in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);

550

in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);

551

in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;

552

})

553

554

// Compute the output tile

555

TILE(DATA_TYPE, 16, N0, out);

556

557

// in * A, with in = A^T * in as above

558

LOOP_UNROLLING(int, i, 0, 1, 4,

559

{

560

tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;

561

tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;

562

tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;

563

tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;

564

tmp[3].v = tmp[3].v + tmp[3].v;

565

tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;

566

tmp[4].v = tmp[4].v + tmp[4].v;

567

tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;

568

569

// 4x4 tile

570

out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);

571

out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);

572

out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;

573

out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;

574

})

575

576

#if defined(HAS_BIAS)

577

TILE(DATA_TYPE, 1, N0, b);

578

579

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

580

581

// c = c + bias[broadcasted]

582

T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);

583

#endif // HAS_BIAS

584

585

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

586

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

587

588

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

589

590

TILE(uint, 16, 1, dst_indirect_y);

591

592

// Calculate the destination indirect Y

593

LOOP_UNROLLING(int, yk, 0, 1, 4,

594

{

595

LOOP_UNROLLING(int, xk, 0, 1, 4,

596

{

597

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

598

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

599

dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;

600

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

605

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

606

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

607

}

608

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

609

610

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

611

#if defined(VEC_SIZE) && VEC_SIZE == 2

612

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC

613

*

614

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

615

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

616

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

617

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

618

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

619

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

620

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

621

*

622

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

623

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

624

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

625

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

626

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

627

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

628

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

629

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

630

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

631

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

632

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

633

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

634

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

635

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

636

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

637

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

638

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

639

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

640

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

641

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

642

*/

643

__kernel void winograd_output_transform_2x1_7x1_nhwc(

644

TENSOR4D_DECLARATION(src),

645

TENSOR4D_DECLARATION(dst),

646

#if defined(HAS_BIAS)

647

VECTOR_DECLARATION(bias),

648

#endif // defined(HAS_BIAS)

649

int dst_size)

650

{

651

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

671

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

676

#endif // defined(HAS_BIAS)

677

dst_size);

678

}

679

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

680

681

#if defined(VEC_SIZE) && VEC_SIZE == 4

682

683

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

684

*

685

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

686

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

687

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

688

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

689

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

690

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

691

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

692

*

693

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

694

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

695

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

696

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

697

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

698

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

699

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

700

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

701

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

702

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

703

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

704

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

705

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

706

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

707

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

708

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

709

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

710

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

711

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

712

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

713

*/

714

__kernel void winograd_output_transform_4x1_3x1_nhwc(

715

TENSOR4D_DECLARATION(src),

716

TENSOR4D_DECLARATION(dst),

717

#if defined(HAS_BIAS)

718

VECTOR_DECLARATION(bias),

719

#endif // defined(HAS_BIAS)

720

int dst_size)

721

{

722

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

742

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

747

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

752

*

753

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

754

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

755

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

756

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

757

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

758

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

759

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

760

*

761

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

762

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

763

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

764

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

765

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

766

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

767

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

768

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

769

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

770

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

771

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

772

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

773

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

774

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

775

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

776

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

777

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

778

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

779

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

780

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

781

*/

782

__kernel void winograd_output_transform_4x1_5x1_nhwc(

783

TENSOR4D_DECLARATION(src),

784

TENSOR4D_DECLARATION(dst),

785

#if defined(HAS_BIAS)

786

VECTOR_DECLARATION(bias),

787

#endif // defined(HAS_BIAS)

788

int dst_size)

789

{

790

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

810

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

815

#endif // defined(HAS_BIAS)

816

dst_size);

817

}

818

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

819

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

820

821

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

822

#if defined(VEC_SIZE) && VEC_SIZE == 2

823

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC

824

*

825

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

826

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

827

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

828

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

829

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

830

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

831

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

832

*

833

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

834

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

835

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

836

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

837

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

838

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

839

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

840

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

841

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

842

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

843

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

844

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

845

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

846

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

847

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

848

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

849

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

850

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

851

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

852

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

853

*/

854

__kernel void winograd_output_transform_1x2_1x7_nhwc(

855

TENSOR4D_DECLARATION(src),

856

TENSOR4D_DECLARATION(dst),

857

#if defined(HAS_BIAS)

858

VECTOR_DECLARATION(bias),

859

#endif // defined(HAS_BIAS)

860

int dst_size)

861

{

862

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

882

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

887

#endif // defined(HAS_BIAS)

888

dst_size);

889

}

890

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

891

892

#if defined(VEC_SIZE) && VEC_SIZE == 4

893

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

894

*

895

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

896

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

897

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

898

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

899

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

900

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

901

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

902

*

903

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

904

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

905

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

906

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

907

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

908

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

909

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

910

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

911

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

912

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

913

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

914

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

915

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

916

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

917

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

918

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

919

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

920

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

921

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

922

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

923

*/

924

__kernel void winograd_output_transform_1x4_1x3_nhwc(

925

TENSOR4D_DECLARATION(src),

926

TENSOR4D_DECLARATION(dst),

927

#if defined(HAS_BIAS)

928

VECTOR_DECLARATION(bias),

929

#endif // defined(HAS_BIAS)

930

int dst_size)

931

{

932

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

952

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

957

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

962

*

963

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

964

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

965

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

966

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

967

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

968

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

969

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

970

*

971

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

972

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

973

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

974

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

975

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

976

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

977

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

978

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

979

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

980

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

981

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

982

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

983

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

984

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

985

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

986

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

987

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

988

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

989

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

990

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

991

*/

992

__kernel void winograd_output_transform_1x4_1x5_nhwc(

993

TENSOR4D_DECLARATION(src),

994

TENSOR4D_DECLARATION(dst),

995

#if defined(HAS_BIAS)

996

VECTOR_DECLARATION(bias),

997

#endif // defined(HAS_BIAS)

998

int dst_size)

999

{

1000

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1020

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1025

#endif // defined(HAS_BIAS)

1026

dst_size);

1027

}

1028

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

1029

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1030

#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)