Blame - src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl - ml/ComputeLibrary

2021-07-05 13:12:52 +0100

[diff] [blame]

31

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC

32

*

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

33

* @note must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

34

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

35

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

36

* @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

37

* @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

38

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

39

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

40

*

41

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

42

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

43

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

44

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

45

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

46

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

47

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

48

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

49

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

50

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

51

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

52

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

53

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

54

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

55

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

56

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

57

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

58

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

59

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

60

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

61

* @param[in] _ISRC_HEIGHT The source tensor's height

62

* @param[in] _IDST_WIDTH The destination tensor's width

63

* @param[in] _IDST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

64

*/

65

__kernel void winograd_output_transform_2x2_7x7_nhwc(

66

TENSOR4D(src, BUFFER),

67

TENSOR4D(dst, BUFFER),

68

#if defined(HAS_BIAS)

69

VECTOR_DECLARATION(bias),

70

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

71

int dst_size,

72

const int _ISRC_HEIGHT,

73

const int _IDST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

74

const int _IDST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

75

{

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

76

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

77

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

78

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

79

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

80

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

81

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

82

83

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

84

TILE(DATA_TYPE, 8, N0, in);

85

TILE(DATA_TYPE, 2, N0, out);

86

TILE(uint, 8, 1, src_indirect_y);

87

88

// Calculate the indirect Y for the source tensor

89

LOOP_UNROLLING(int, i, 0, 1, 8,

90

{

91

src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;

92

src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);

93

})

94

95

// Initialize the input tile

96

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

// Load the values across the 8 channels to compose the 8x1 tile

102

T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

103

104

// Compute out0 and out01

105

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;

106

out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;

107

108

#if defined(HAS_BIAS)

109

// Add bias

110

TILE(DATA_TYPE, 1, N0, b);

111

112

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

113

114

T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);

115

#endif // defined(HAS_BIAS)

116

117

T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

118

119

TILE(uint, 2, 1, dst_indirect_y);

120

121

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

122

LOOP_UNROLLING(int, yk, 0, 1, 2,

123

{

124

int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));

125

dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);

126

})

127

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

128

LOOP_UNROLLING(int, xk, 0, 1, 2,

129

{

130

int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));

131

dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);

132

})

133

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

134

135

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

136

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

137

138

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

139

140

TILE(DATA_TYPE, 64, N0, in);

141

TILE(DATA_TYPE, 4, N0, out);

142

TILE(DATA_TYPE, 16, N0, tmp);

143

TILE(uint, 64, 1, src_indirect_y);

144

145

// Calculate the indirect Y for the source tensor

146

LOOP_UNROLLING(int, i, 0, 1, 64,

147

{

148

src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;

149

src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);

150

})

151

152

// Initialize the input tile

153

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// Load the values across the 64 channels to compose the 8x8 tile

159

T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

160

161

LOOP_UNROLLING(int, i, 0, 1, 8,

162

{

163

tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;

164

tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;

165

})

166

167

// Compute the 2x2 output tile

168

LOOP_UNROLLING(int, i, 0, 1, 2,

169

{

170

out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;

171

out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;

172

})

173

174

#if defined(HAS_BIAS)

175

// Add bias

176

TILE(DATA_TYPE, 1, N0, b);

177

178

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

179

180

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

181

#endif // defined(HAS_BIAS)

182

183

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

184

185

TILE(uint, 4, 1, dst_indirect_y);

186

187

// Calculate the destination indirect Y

188

LOOP_UNROLLING(int, yk, 0, 1, 2,

189

{

190

LOOP_UNROLLING(int, xk, 0, 1, 2,

191

{

192

int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));

193

int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));

194

dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;

195

dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

200

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

201

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

202

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

203

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

204

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

205

206

#if defined(VEC_SIZE) && VEC_SIZE == 4

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

207

#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

208

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

209

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

210

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

211

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

212

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

213

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

214

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

215

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

216

*

217

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

218

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

219

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

220

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

221

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

222

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

223

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

224

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

225

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

226

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

227

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

228

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

229

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

230

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

231

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

232

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

233

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

234

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

235

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

236

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

237

* @param[in] dst_size Size of the destination tensor, minus the last padding

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

238

* @param[in] SRC_HEIGHT The source tensor's height

239

* @param[in] DST_WIDTH The destination tensor's width

240

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

241

*/

242

__kernel void winograd_output_transform_4x4_3x3_nhwc(

243

TENSOR4D(src, BUFFER),

244

TENSOR4D(dst, BUFFER),

245

#if defined(HAS_BIAS)

246

VECTOR_DECLARATION(bias),

247

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

248

int dst_size,

249

const int SRC_HEIGHT,

250

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

251

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

252

{

253

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

254

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

255

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

256

257

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

258

259

TILE(DATA_TYPE, 6, N0, in);

260

TILE(DATA_TYPE, 4, N0, out);

261

TILE(uint, 6, 1, src_indirect_y);

262

263

LOOP_UNROLLING(int, i, 0, 1, 6,

264

{

265

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

266

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);

267

})

268

269

// Initialize the input tile

270

LOOP_UNROLLING(int, i, 0, 1, 6,

{

in[i].v = 0;

})

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

276

T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

277

278

// Compute out00, out01, out02 and out03

279

out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;

280

out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;

281

out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;

282

out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;

283

284

#if defined(HAS_BIAS)

285

TILE(DATA_TYPE, 1, N0, b);

286

287

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

288

289

// c = c + bias[broadcasted]

290

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

291

#endif // HAS_BIAS

292

293

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

294

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

295

296

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

297

298

TILE(uint, 4, 1, dst_indirect_y);

299

300

// Calculate the destination indirect Y

301

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

302

LOOP_UNROLLING(int, yk, 0, 1, 4,

303

{

304

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

305

dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;

306

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

307

})

308

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

309

LOOP_UNROLLING(int, xk, 0, 1, 4,

310

{

311

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

312

dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;

313

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

314

})

315

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

316

317

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

318

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

319

320

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

321

322

// Calculate the indirect Y for the source tensor

323

TILE(DATA_TYPE, 36, N0, in);

324

TILE(DATA_TYPE, 4, N0, tmp);

325

TILE(uint, 36, 1, src_indirect_y);

326

327

LOOP_UNROLLING(int, i, 0, 1, 36,

328

{

329

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

330

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);

331

})

332

333

// Initialize the input tile

334

LOOP_UNROLLING(int, i, 0, 1, 36,

{

in[i].v = 0;

})

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

340

T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

341

342

LOOP_UNROLLING(int, i, 0, 1, 6,

343

{

344

tmp[0].v = in[6 + i].v + in[12 + i].v;

345

tmp[1].v = in[6 + i].v - in[12 + i].v;

346

tmp[2].v = in[18 + i].v + in[24 + i].v;

347

tmp[3].v = in[18 + i].v - in[24 + i].v;

348

tmp[3].v = tmp[3].v + tmp[3].v;

349

in[i].v = in[i].v + tmp[0].v + tmp[2].v;

350

in[6 + i].v = tmp[3].v + tmp[1].v;

351

in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

352

in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;

353

})

354

355

// Compute the output tile

356

TILE(DATA_TYPE, 16, N0, out);

357

358

LOOP_UNROLLING(int, i, 0, 1, 4,

359

{

360

tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;

361

tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;

362

tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;

363

tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;

364

tmp[3].v = tmp[3].v + tmp[3].v;

365

out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;

366

out[4 * i + 1].v = tmp[3].v + tmp[1].v;

367

out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);

368

out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;

369

})

370

371

#if defined(HAS_BIAS)

372

TILE(DATA_TYPE, 1, N0, b);

373

374

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

375

376

// c = c + bias[broadcasted]

377

T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);

378

#endif // HAS_BIAS

379

380

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

381

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

382

383

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

384

385

TILE(uint, 16, 1, dst_indirect_y);

386

387

// Calculate the destination indirect Y

388

LOOP_UNROLLING(int, yk, 0, 1, 4,

389

{

390

LOOP_UNROLLING(int, xk, 0, 1, 4,

391

{

392

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

393

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

394

dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;

395

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

400

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

401

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

402

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

403

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

404

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

405

#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

406

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

407

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

408

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

409

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

410

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

411

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

412

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

413

* @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1

414

*

415

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

416

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

417

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

419

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

421

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

423

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

424

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

425

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

426

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

427

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

428

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

429

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

430

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

431

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

432

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

433

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

434

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

435

* @param[in] SRC_HEIGHT The source tensor's height

436

* @param[in] DST_WIDTH The destination tensor's width

437

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

438

*/

439

__kernel void winograd_output_transform_4x4_5x5_nhwc(

440

TENSOR4D(src, BUFFER),

441

TENSOR4D(dst, BUFFER),

442

#if defined(HAS_BIAS)

443

VECTOR_DECLARATION(bias),

444

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

445

int dst_size,

446

const int SRC_HEIGHT,

447

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

448

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

449

{

450

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

451

const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES

452

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

453

454

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

455

TILE(DATA_TYPE, 8, N0, in);

456

TILE(DATA_TYPE, 4, N0, out);

457

TILE(DATA_TYPE, 4, N0, tmp);

458

TILE(uint, 8, 1, src_indirect_y);

459

460

LOOP_UNROLLING(int, i, 0, 1, 8,

461

{

462

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

463

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);

464

})

465

466

// Initialize the input tile

467

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

// "in" contains 1x8 or 8x1 tile here

473

T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

474

475

// A^T * in, and in this degenerate case out consists of 1 column/row

476

tmp[0].v = in[1].v - in[2].v;

477

tmp[1].v = 2.0f * (in[3].v - in[4].v);

478

tmp[2].v = 2.0f * (in[5].v + in[6].v);

479

tmp[3].v = in[3].v + in[4].v;

480

out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;

481

out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);

482

out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;

483

out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;

484

485

#if defined(HAS_BIAS)

486

TILE(DATA_TYPE, 1, N0, b);

487

488

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

489

490

// c = c + bias[broadcasted]

491

T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);

492

#endif // HAS_BIAS

493

494

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

495

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

496

497

T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

498

499

TILE(uint, 4, 1, dst_indirect_y);

500

501

// Calculate the destination indirect Y

502

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

503

LOOP_UNROLLING(int, yk, 0, 1, 4,

504

{

505

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

506

dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;

507

dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

508

})

509

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

510

LOOP_UNROLLING(int, xk, 0, 1, 4,

511

{

512

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

513

dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;

514

dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

515

})

516

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

517

518

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

519

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

520

521

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

522

// Calculate the indirect Y for the source tensor

523

TILE(DATA_TYPE, 64, N0, in);

524

TILE(DATA_TYPE, 6, N0, tmp);

525

TILE(uint, 64, 1, src_indirect_y);

526

527

LOOP_UNROLLING(int, i, 0, 1, 64,

528

{

529

src_indirect_y[i].v = mout + i *SRC_HEIGHT;

530

src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);

531

})

532

533

// Initialize the input tile

534

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// "in" here is 8x8 tile

540

T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);

541

542

// A^T * in

543

LOOP_UNROLLING(int, i, 0, 1, 8,

544

{

545

tmp[0].v = in[8 + i].v + in[16 + i].v;

546

tmp[1].v = in[8 + i].v - in[16 + i].v;

547

tmp[2].v = in[24 + i].v + in[32 + i].v;

548

tmp[3].v = in[24 + i].v - in[32 + i].v;

549

tmp[3].v = tmp[3].v + tmp[3].v;

550

tmp[4].v = in[40 + i].v + in[48 + i].v;

551

tmp[4].v = tmp[4].v + tmp[4].v;

552

tmp[5].v = in[40 + i].v - in[48 + i].v;

553

554

// 4x8 matrix as a result

555

in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);

556

in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);

557

in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);

558

in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;

559

})

560

561

// Compute the output tile

562

TILE(DATA_TYPE, 16, N0, out);

563

564

// in * A, with in = A^T * in as above

565

LOOP_UNROLLING(int, i, 0, 1, 4,

566

{

567

tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;

568

tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;

569

tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;

570

tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;

571

tmp[3].v = tmp[3].v + tmp[3].v;

572

tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;

573

tmp[4].v = tmp[4].v + tmp[4].v;

574

tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;

575

576

// 4x4 tile

577

out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);

578

out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);

579

out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;

580

out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;

581

})

582

583

#if defined(HAS_BIAS)

584

TILE(DATA_TYPE, 1, N0, b);

585

586

T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);

587

588

// c = c + bias[broadcasted]

589

T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);

590

#endif // HAS_BIAS

591

592

int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;

593

int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;

594

595

T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);

596

597

TILE(uint, 16, 1, dst_indirect_y);

598

599

// Calculate the destination indirect Y

600

LOOP_UNROLLING(int, yk, 0, 1, 4,

601

{

602

LOOP_UNROLLING(int, xk, 0, 1, 4,

603

{

604

int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));

605

int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));

606

dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;

607

dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);

})

})

// Store the tile in reverse order so the invalid values are overwritten with the valid ones

612

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

613

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

614

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

615

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

616

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

617

618

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

619

#if defined(VEC_SIZE) && VEC_SIZE == 2

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

620

#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

621

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC

622

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

623

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

624

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

625

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

626

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

627

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

628

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

629

*

630

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

631

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

632

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

633

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

634

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

635

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

636

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

637

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

638

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

639

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

640

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

641

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

642

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

643

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

644

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

645

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

646

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

647

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

648

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

649

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

650

* @param[in] SRC_HEIGHT The source tensor's height

651

* @param[in] DST_WIDTH The destination tensor's width

652

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

653

*/

654

__kernel void winograd_output_transform_2x1_7x1_nhwc(

655

TENSOR4D_DECLARATION(src),

656

TENSOR4D_DECLARATION(dst),

657

#if defined(HAS_BIAS)

658

VECTOR_DECLARATION(bias),

659

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

660

int dst_size,

661

const int SRC_HEIGHT,

662

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

663

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

664

{

665

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

685

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

690

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

691

dst_size,

692

SRC_HEIGHT,

693

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

694

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

695

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

696

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

697

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

698

699

#if defined(VEC_SIZE) && VEC_SIZE == 4

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

700

#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

701

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

702

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

703

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

704

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

705

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

706

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

707

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

708

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

709

*

710

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

711

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

712

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

713

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

714

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

715

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

716

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

717

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

718

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

719

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

720

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

721

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

722

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

723

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

724

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

725

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

726

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

727

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

728

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

729

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

730

* @param[in] SRC_HEIGHT The source tensor's height

731

* @param[in] DST_WIDTH The destination tensor's width

732

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

733

*/

734

__kernel void winograd_output_transform_4x1_3x1_nhwc(

735

TENSOR4D_DECLARATION(src),

736

TENSOR4D_DECLARATION(dst),

737

#if defined(HAS_BIAS)

738

VECTOR_DECLARATION(bias),

739

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

740

int dst_size,

741

const int SRC_HEIGHT,

742

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

743

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

744

{

745

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

765

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

770

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

771

dst_size,

772

SRC_HEIGHT,

773

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

774

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

775

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

776

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

777

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

778

#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

779

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

780

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

781

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

782

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

783

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

784

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

785

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

786

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

787

*

788

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

789

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

790

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

791

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

792

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

793

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

794

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

795

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

796

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

797

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

798

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

799

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

800

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

801

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

802

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

803

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

804

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

805

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

806

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

807

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

808

* @param[in] SRC_HEIGHT The source tensor's height

809

* @param[in] DST_WIDTH The destination tensor's width

810

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

811

*/

812

__kernel void winograd_output_transform_4x1_5x1_nhwc(

813

TENSOR4D_DECLARATION(src),

814

TENSOR4D_DECLARATION(dst),

815

#if defined(HAS_BIAS)

816

VECTOR_DECLARATION(bias),

817

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

818

int dst_size,

819

const int SRC_HEIGHT,

820

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

821

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

822

{

823

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

843

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

848

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

849

dst_size,

850

SRC_HEIGHT,

851

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

852

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

853

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

854

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

855

#endif // defined(VEC_SIZE) && VEC_SIZE == 4

856

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

857

858

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

859

#if defined(VEC_SIZE) && VEC_SIZE == 2

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

860

#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

861

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC

862

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

863

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

864

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

865

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

866

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

867

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

868

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

869

*

870

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

871

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

872

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

873

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

874

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

875

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

876

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

877

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

878

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

879

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

880

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

881

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

882

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

883

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

884

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

885

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

886

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

887

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

888

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

889

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

890

* @param[in] SRC_HEIGHT The source tensor's height

891

* @param[in] DST_WIDTH The destination tensor's width

892

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

893

*/

894

__kernel void winograd_output_transform_1x2_1x7_nhwc(

895

TENSOR4D_DECLARATION(src),

896

TENSOR4D_DECLARATION(dst),

897

#if defined(HAS_BIAS)

898

VECTOR_DECLARATION(bias),

899

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

900

int dst_size,

901

const int SRC_HEIGHT,

902

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

903

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

904

{

905

winograd_output_transform_2x2_7x7_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

925

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

930

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

931

dst_size,

932

SRC_HEIGHT,

933

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

934

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

935

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

936

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

937

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

938

939

#if defined(VEC_SIZE) && VEC_SIZE == 4

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

940

#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

941

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

942

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

943

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

944

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

945

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

946

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

947

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

948

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

949

*

950

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

951

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

952

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

953

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

954

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

955

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

956

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

957

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

958

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

959

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

960

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

961

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

962

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

963

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

964

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

965

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

966

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

967

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

968

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

969

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

970

* @param[in] SRC_HEIGHT The source tensor's height

971

* @param[in] DST_WIDTH The destination tensor's width

972

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

973

*/

974

__kernel void winograd_output_transform_1x4_1x3_nhwc(

975

TENSOR4D_DECLARATION(src),

976

TENSOR4D_DECLARATION(dst),

977

#if defined(HAS_BIAS)

978

VECTOR_DECLARATION(bias),

979

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

980

int dst_size,

981

const int SRC_HEIGHT,

982

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

983

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

984

{

985

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1005

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1010

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1011

dst_size,

1012

SRC_HEIGHT,

1013

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

1014

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1015

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1016

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1017

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1018

#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1019

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

1020

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1021

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1022

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1023

* @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24

1024

* @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32

1025

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1026

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1027

*

1028

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16

1029

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1030

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1031

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1032

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1033

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1034

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1035

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1036

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1037

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1038

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1039

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1040

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1041

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1042

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1043

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

1044

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1045

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

1046

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1047

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1048

* @param[in] SRC_HEIGHT The source tensor's height

1049

* @param[in] DST_WIDTH The destination tensor's width

1050

* @param[in] DST_HEIGHT The destination tensor's height

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1051

*/

1052

__kernel void winograd_output_transform_1x4_1x5_nhwc(

1053

TENSOR4D_DECLARATION(src),

1054

TENSOR4D_DECLARATION(dst),

1055

#if defined(HAS_BIAS)

1056

VECTOR_DECLARATION(bias),

1057

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1058

int dst_size,

1059

const int SRC_HEIGHT,

1060

const int DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

1061

const int DST_HEIGHT)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1062

{

1063

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes,

1083

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1088

#endif // defined(HAS_BIAS)

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1089

dst_size,

1090

SRC_HEIGHT,

1091

DST_WIDTH,

Gian Marco Iodice

2022-06-14 15:13:16 +0100

[diff] [blame]

1092

DST_HEIGHT);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1093

}

ramelg01

2022-02-08 09:38:17 +0000

[diff] [blame]

1094

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)

Adnan AlSinan