Blame - src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl - ml/ComputeLibrary

/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

70

*

71

* @note Data layout supported: NHWC

72

* @note Data type supported: F32/F16

73

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

74

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

75

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

76

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

77

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

78

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

79

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

80

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

81

*

82

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

83

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

84

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

85

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

86

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

87

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

88

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

89

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

90

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

91

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

92

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

93

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

94

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

95

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

96

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

97

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

98

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

99

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

100

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

101

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

102

*/

103

//! @endcond

104

__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(

105

TENSOR4D(src, BUFFER),

106

TENSOR4D(dst, BUFFER))

107

{

108

const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM

109

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

110

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

111

112

// All the tensor dimensions are passed at compile time.

113

// In case of dynamic tensor support, the following dimensions should be passed as function argument.

114

#define _ISRC_WIDTH SRC_WIDTH

115

#define _ISRC_HEIGHT SRC_HEIGHT

116

#define _INUM_TILES_X NUM_TILES_X

117

#define _INUM_TILES_Y NUM_TILES_Y

118

119

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

120

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

x -= PAD_LEFT;

y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

125

126

TILE(DATA_TYPE, 6, 1, in);

127

TILE(DATA_TYPE, 6, 1, out);

128

129

// Initialize the input tile

130

LOOP_UNROLLING(int, i, 0, 1, 6,

{

in[i].v = 0;

})

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

136

T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

137

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

138

T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

139

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

140

141

TILE(DATA_TYPE, 6, 1, com);

142

143

LOOP_UNROLLING(int, i, 0, 1, 6,

{

in[i].v *= 4.0f;

})

com[0].v = in[2].v - 4.f * in[0].v;

149

com[1].v = in[3].v - 4.f * in[1].v;

150

com[2].v = in[4].v - 4.f * in[2].v;

151

com[3].v = in[5].v - 4.f * in[3].v;

152

com[4].v = in[3].v - in[1].v;

153

com[4].v = com[4].v + com[4].v;

154

com[5].v = in[4].v - in[2].v;

155

156

out[0].v = com[2].v - com[0].v;

157

out[1].v = com[2].v + com[1].v;

158

out[2].v = com[2].v - com[1].v;

159

out[3].v = com[5].v + com[4].v;

160

out[4].v = com[5].v - com[4].v;

161

out[5].v = com[3].v - com[1].v;

162

163

TILE(uint, 6, 1, dst_indirect_y);

164

165

LOOP_UNROLLING(int, i, 0, 1, 6,

166

{

167

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

168

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6;

169

})

170

171

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

172

173

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

174

175

TILE(DATA_TYPE, 36, 1, in);

176

177

// Initialize the input tile

178

LOOP_UNROLLING(int, i, 0, 1, 36,

{

in[i].v = 0;

})

// Load the tile from a NHWC tensor

184

T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

185

186

TILE(DATA_TYPE, 6, 1, com);

187

TILE(DATA_TYPE, 36, 1, tmp);

188

189

LOOP_UNROLLING(int, i, 0, 1, 6,

190

{

191

com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;

192

com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;

193

com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;

194

com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;

195

com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;

196

com[4].v = com[4].v + com[4].v;

197

com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;

198

tmp[i + 0 * 6].v = com[2].v - com[0].v;

199

tmp[i + 1 * 6].v = com[2].v + com[1].v;

200

tmp[i + 2 * 6].v = com[2].v - com[1].v;

201

tmp[i + 3 * 6].v = com[5].v + com[4].v;

202

tmp[i + 4 * 6].v = com[5].v - com[4].v;

203

tmp[i + 5 * 6].v = com[3].v - com[1].v;

204

})

205

206

TILE(DATA_TYPE, 36, 1, out);

207

208

LOOP_UNROLLING(int, i, 0, 1, 6,

209

{

210

com[0].v = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v;

211

com[1].v = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v;

212

com[2].v = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v;

213

com[3].v = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v;

214

com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;

215

com[4].v = com[4].v + com[4].v;

216

com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;

217

out[i * 6 + 0].v = com[2].v - com[0].v;

218

out[i * 6 + 1].v = com[2].v + com[1].v;

219

out[i * 6 + 2].v = com[2].v - com[1].v;

220

out[i * 6 + 3].v = com[5].v + com[4].v;

221

out[i * 6 + 4].v = com[5].v - com[4].v;

222

out[i * 6 + 5].v = com[3].v - com[1].v;

223

})

224

225

// Compute destination address

226

TILE(uint, 36, 1, dst_indirect_y);

227

228

LOOP_UNROLLING(int, i, 0, 1, 36,

229

{

230

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

231

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36;

232

})

233

234

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

235

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

236

}

237

238

//! @cond Doxygen_Suppress

239

/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC

240

*

241

* @note Data layout supported: NHWC

242

* @note Data type supported: F32/F16

243

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

244

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

245

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

246

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

247

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

248

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

249

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

250

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

251

*

252

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

253

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

254

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

255

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

256

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

257

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

258

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

259

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

260

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

261

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

262

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

263

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

264

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

265

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

266

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

267

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

268

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

269

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

270

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

271

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

272

*/

273

//! @endcond

274

__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(

275

TENSOR4D(src, BUFFER),

276

TENSOR4D(dst, BUFFER))

277

{

278

const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM

279

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

280

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

281

282

// All the tensor dimensions are passed at compile time.

283

// In case of dynamic tensor support, the following dimensions should be passed as function argument.

284

#define _ISRC_WIDTH SRC_WIDTH

285

#define _ISRC_HEIGHT SRC_HEIGHT

286

#define _INUM_TILES_X NUM_TILES_X

287

#define _INUM_TILES_Y NUM_TILES_Y

288

289

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

290

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

x -= PAD_LEFT;

y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

295

296

TILE(DATA_TYPE, 8, 1, in);

297

TILE(DATA_TYPE, 8, 1, out);

298

299

// Initialize the input tile

300

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

306

T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

307

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

308

T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

309

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

310

311

TILE(DATA_TYPE, 1, 8, com);

312

313

com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;

314

com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;

315

com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;

316

com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;

317

com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;

318

com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;

319

out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;

320

out[1].s[0] = com[0].s[0] + com[0].s[1];

321

out[2].s[0] = com[0].s[0] - com[0].s[1];

322

out[3].s[0] = com[0].s[3] + com[0].s[2];

323

out[4].s[0] = com[0].s[3] - com[0].s[2];

324

out[5].s[0] = com[0].s[4] + com[0].s[5];

325

out[6].s[0] = com[0].s[4] - com[0].s[5];

326

out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;

327

328

TILE(uint, 8, 1, dst_indirect_y);

329

330

LOOP_UNROLLING(int, i, 0, 1, 8,

331

{

332

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

333

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;

334

})

335

336

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

337

338

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

339

340

TILE(DATA_TYPE, 64, 1, in);

341

TILE(DATA_TYPE, 64, 1, out);

342

343

// Initialize the input tile

344

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// Load the tile from a NHWC tensor

350

T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

351

352

TILE(DATA_TYPE, 8, 8, com);

353

354

LOOP_UNROLLING(int, i, 0, 1, 8,

355

{

356

com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x

357

com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x

358

com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x

359

com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x

360

com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

361

com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];

362

com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];

363

com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];

364

})

365

366

TILE(DATA_TYPE, 8, 8, tmp);

367

tmp[0].v = com[6].v;

368

tmp[1].v = com[0].v + com[1].v;

369

tmp[2].v = com[0].v - com[1].v;

370

tmp[3].v = com[2].v + com[3].v;

371

tmp[4].v = com[2].v - com[3].v;

372

tmp[5].v = com[4].v + com[5].v;

373

tmp[6].v = com[4].v - com[5].v;

374

tmp[7].v = com[7].v;

375

376

LOOP_UNROLLING(int, i, 0, 1, 8,

377

{

378

com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];

379

com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];

380

com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];

381

com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];

382

com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];

383

com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];

384

out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];

385

out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];

386

out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];

387

out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];

388

out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];

389

out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];

390

out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];

391

out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];

392

})

393

394

TILE(uint, 64, 1, dst_indirect_y);

395

396

LOOP_UNROLLING(int, i, 0, 1, 64,

397

{

398

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

399

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;

400

})

401

402

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

403

404

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

405

}

406

407

//! @cond Doxygen_Suppress

408

/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC

409

*

410

* @note Data layout supported: NHWC

411

* @note Data type supported: F32/F16

412

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

413

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

414

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

415

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

416

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

417

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

418

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

419

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

420

*

421

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

422

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

423

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

424

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

425

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

426

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

427

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

428

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

429

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

430

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

431

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

432

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

433

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

434

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

435

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

436

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

437

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

438

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

439

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

440

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

441

*/

442

//! @endcond

443

__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(

444

TENSOR4D(src, BUFFER),

445

TENSOR4D(dst, BUFFER))

446

{

447

const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM

448

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

449

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

450

451

// All the tensor dimensions are passed at compile time.

452

// In case of dynamic tensor support, the following dimensions should be passed as function argument.

453

#define _ISRC_WIDTH SRC_WIDTH

454

#define _ISRC_HEIGHT SRC_HEIGHT

455

#define _INUM_TILES_X NUM_TILES_X

456

#define _INUM_TILES_Y NUM_TILES_Y

457

458

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

459

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

x -= PAD_LEFT;

y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

464

465

TILE(DATA_TYPE, 8, 1, in);

466

TILE(DATA_TYPE, 8, 1, out);

467

468

// Initialize the input tile

469

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

475

T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

476

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

477

T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

478

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

479

480

LOOP_UNROLLING(int, i, 0, 1, 8,

481

{

482

in[i].v *= (DATA_TYPE) - 36.0f;

483

})

484

485

TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };

486

487

com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;

488

com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;

489

com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;

490

com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;

491

com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;

492

com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;

493

out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;

494

out[1].s[0] = com[0].s[0] - com[0].s[1];

495

out[2].s[0] = com[0].s[0] + com[0].s[1];

496

out[3].s[0] = com[0].s[2] - com[0].s[3];

497

out[4].s[0] = com[0].s[2] + com[0].s[3];

498

out[5].s[0] = com[0].s[4] - com[0].s[5];

499

out[6].s[0] = com[0].s[4] + com[0].s[5];

500

out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;

501

502

TILE(uint, 8, 1, dst_indirect_y);

503

504

LOOP_UNROLLING(int, i, 0, 1, 8,

505

{

506

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

507

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;

508

})

509

510

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

511

512

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

513

514

TILE(DATA_TYPE, 64, 1, in);

515

TILE(DATA_TYPE, 64, 1, out);

516

517

// Initialize the input tile

518

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// Load the tile from a NHWC tensor

524

T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

525

526

TILE(DATA_TYPE, 8, 8, com);

527

528

LOOP_UNROLLING(int, i, 0, 1, 8,

529

{

530

com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

531

com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];

532

com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

533

com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];

534

com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

535

com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];

536

com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];

537

com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];

538

})

539

540

TILE(DATA_TYPE, 8, 8, tmp);

541

tmp[0].v = com[6].v;

542

tmp[1].v = com[0].v - com[1].v;

543

tmp[2].v = com[0].v + com[1].v;

544

tmp[3].v = com[2].v - com[3].v;

545

tmp[4].v = com[2].v + com[3].v;

546

tmp[5].v = com[4].v - com[5].v;

547

tmp[6].v = com[4].v + com[5].v;

548

tmp[7].v = com[7].v;

549

550

LOOP_UNROLLING(int, i, 0, 1, 8,

551

{

552

com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];

553

com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];

554

com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];

555

com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];

556

com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];

557

com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];

558

out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];

559

out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];

560

out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];

561

out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];

562

out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];

563

out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];

564

out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];

565

out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];

566

})

567

568

TILE(uint, 64, 1, dst_indirect_y);

569

570

LOOP_UNROLLING(int, i, 0, 1, 64,

571

{

572

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

573

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;

574

})

575

576

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

577

578

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

579

}

580

581

//! @cond Doxygen_Suppress

582

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC

583

*

584

* @note Data layout supported: NHWC

585

* @note Data type supported: F32/F16

586

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

587

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

588

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

589

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

590

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

591

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

592

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

593

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

594

*

595

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

596

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

597

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

598

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

599

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

600

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

601

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

602

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

603

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

604

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

605

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

606

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

607

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

608

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

609

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

610

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

611

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

612

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

613

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

614

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

615

*/

616

//! @endcond

617

__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(

618

TENSOR4D(src, BUFFER),

619

TENSOR4D(dst, BUFFER))

620

{

621

winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

641

}

642

643

//! @cond Doxygen_Suppress

644

/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC

645

*

646

* @note Data layout supported: NHWC

647

* @note Data type supported: F32/F16

648

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

649

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

650

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

651

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

652

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

653

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

654

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

655

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

656

*

657

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

658

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

659

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

660

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

661

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

662

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

663

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

664

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

665

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

666

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

667

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

668

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

669

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

670

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

671

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

672

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

673

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

674

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

675

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

676

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

677

*/

678

//! @endcond

679

__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(

680

TENSOR4D(src, BUFFER),

681

TENSOR4D(dst, BUFFER))

682

{

683

winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

703

}

704

705

//! @cond Doxygen_Suppress

706

/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC

707

*

708

* @note Data layout supported: NHWC

709

* @note Data type supported: F32/F16

710

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

711

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

712

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

713

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

714

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

715

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

716

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

717

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

718

*

719

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

720

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

721

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

722

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

723

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

724

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

725

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

726

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

727

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

728

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

729

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

730

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

731

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

732

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

733

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

734

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

735

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

736

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

737

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

738

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

739

*/

740

//! @endcond

741

__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(

742

TENSOR4D(src, BUFFER),

743

TENSOR4D(dst, BUFFER))

744

{

745

winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

765

}

766

767

//! @cond Doxygen_Suppress

768

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC

769

*

770

* @note Data layout supported: NHWC

771

* @note Data type supported: F32/F16

772

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

773

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

774

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

775

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

776

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

777

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

778

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

779

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

780

*

781

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

782

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

783

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

784

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

785

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

786

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

787

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

788

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

789

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

790

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

791

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

792

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

793

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

794

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

795

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

796

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

797

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

798

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

799

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

800

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

801

*/

802

//! @endcond

803

__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(

804

TENSOR4D(src, BUFFER),

805

TENSOR4D(dst, BUFFER))

806

{

807

winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

827

}

828

829

//! @cond Doxygen_Suppress

830

/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC

831

*

832

* @note Data layout supported: NHWC

833

* @note Data type supported: F32/F16

834

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

835

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

836

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

837

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

838

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

839

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

840

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

841

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

842

*

843

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

844

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

845

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

846

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

847

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

848

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

849

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

850

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

851

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

852

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

853

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

854

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

855

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

856

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

857

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

858

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

859

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

860

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

861

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

862

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

863

*/

864

//! @endcond

865

__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(

866

TENSOR4D(src, BUFFER),

867

TENSOR4D(dst, BUFFER))

868

{

869

winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

889

}

890

891

//! @cond Doxygen_Suppress

892

/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC

893

*

894

* @note Data layout supported: NHWC

895

* @note Data type supported: F32/F16

896

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

897

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).

898

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

899

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

900

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

901

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

902

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

903

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

904

*

905

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

906

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

907

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

908

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

909

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

910

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

911

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

912

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

913

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

914

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

915

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

916

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

917

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

918

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

919

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

920

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

921

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

922

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

923

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

924

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

925

*/

926

//! @endcond

927

__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(

928

TENSOR4D(src, BUFFER),

929

TENSOR4D(dst, BUFFER))

930

{

931

winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

951

}

952

#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)

953

#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)