Blame - src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl - ml/ComputeLibrary

2021-07-05 13:12:52 +0100

[diff] [blame]

69

//! @cond Doxygen_Suppress

70

/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

71

*

72

* @note Data layout supported: NHWC

73

* @note Data type supported: F32/F16

74

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

75

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

76

* @note The spatial dimensions of the source tensor are passed at run time through the _ISRC_WIDTH and _ISRC_HEIGHT kernel arguments

77

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

78

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

79

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

80

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

81

*

82

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

83

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

84

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

85

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

86

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

87

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

88

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

89

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

90

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

91

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

92

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

93

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

94

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

95

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

96

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

97

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

98

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

99

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

100

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

101

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

102

* @param[in] _ISRC_WIDTH The src tensor's width

103

* @param[in] _ISRC_HEIGHT The src tensor's height

104

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

105

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

106

*/

107

//! @endcond

108

__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(

109

TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

110

TENSOR4D(dst, BUFFER),

111

const int _ISRC_WIDTH,

112

const int _ISRC_HEIGHT,

113

const int _INUM_TILES_X,

114

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

115

{

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

116

const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM

117

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

118

#if defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

119

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

120

#else // defined(IS_BATCHED)

121

const int bout = 0; // BATCH SIZE IDX

122

#endif // defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

123

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

124

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

125

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

x -= PAD_LEFT;

y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

130

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

131

TILE(DATA_TYPE, 6, N0, in);

132

TILE(DATA_TYPE, 6, N0, out);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

133

134

// Initialize the input tile

135

LOOP_UNROLLING(int, i, 0, 1, 6,

{

in[i].v = 0;

})

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

141

T_LOAD_NHWC(DATA_TYPE, 1, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

142

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

143

T_LOAD_NHWC(DATA_TYPE, 6, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

144

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

145

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

146

TILE(DATA_TYPE, 6, N0, com);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

147

148

LOOP_UNROLLING(int, i, 0, 1, 6,

149

{

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

150

in[i].v *= (DATA_TYPE)4.0f;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

151

})

152

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

153

com[0].v = in[2].v - (DATA_TYPE)4.f * in[0].v;

154

com[1].v = in[3].v - (DATA_TYPE)4.f * in[1].v;

155

com[2].v = in[4].v - (DATA_TYPE)4.f * in[2].v;

156

com[3].v = in[5].v - (DATA_TYPE)4.f * in[3].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

157

com[4].v = in[3].v - in[1].v;

158

com[4].v = com[4].v + com[4].v;

159

com[5].v = in[4].v - in[2].v;

160

161

out[0].v = com[2].v - com[0].v;

162

out[1].v = com[2].v + com[1].v;

163

out[2].v = com[2].v - com[1].v;

164

out[3].v = com[5].v + com[4].v;

165

out[4].v = com[5].v - com[4].v;

166

out[5].v = com[3].v - com[1].v;

167

168

TILE(uint, 6, 1, dst_indirect_y);

169

170

LOOP_UNROLLING(int, i, 0, 1, 6,

171

{

172

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

173

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6;

174

})

175

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

176

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

177

178

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

179

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

180

TILE(DATA_TYPE, 36, N0, in);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

181

182

// Initialize the input tile

183

LOOP_UNROLLING(int, i, 0, 1, 36,

{

in[i].v = 0;

})

// Load the tile from a NHWC tensor

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

189

T_LOAD_NHWC(DATA_TYPE, 6, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

190

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

191

TILE(DATA_TYPE, 6, N0, com);

192

TILE(DATA_TYPE, 36, N0, tmp);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

193

194

LOOP_UNROLLING(int, i, 0, 1, 6,

195

{

196

com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;

197

com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;

198

com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;

199

com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;

200

com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;

201

com[4].v = com[4].v + com[4].v;

202

com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;

203

tmp[i + 0 * 6].v = com[2].v - com[0].v;

204

tmp[i + 1 * 6].v = com[2].v + com[1].v;

205

tmp[i + 2 * 6].v = com[2].v - com[1].v;

206

tmp[i + 3 * 6].v = com[5].v + com[4].v;

207

tmp[i + 4 * 6].v = com[5].v - com[4].v;

208

tmp[i + 5 * 6].v = com[3].v - com[1].v;

209

})

210

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

211

TILE(DATA_TYPE, 36, N0, out);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

212

213

LOOP_UNROLLING(int, i, 0, 1, 6,

214

{

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

215

com[0].v = tmp[i * 6 + 2].v - (DATA_TYPE)4.f *tmp[i * 6 + 0].v;

216

com[1].v = tmp[i * 6 + 3].v - (DATA_TYPE)4.f *tmp[i * 6 + 1].v;

217

com[2].v = tmp[i * 6 + 4].v - (DATA_TYPE)4.f *tmp[i * 6 + 2].v;

218

com[3].v = tmp[i * 6 + 5].v - (DATA_TYPE)4.f *tmp[i * 6 + 3].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

219

com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;

220

com[4].v = com[4].v + com[4].v;

221

com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;

222

out[i * 6 + 0].v = com[2].v - com[0].v;

223

out[i * 6 + 1].v = com[2].v + com[1].v;

224

out[i * 6 + 2].v = com[2].v - com[1].v;

225

out[i * 6 + 3].v = com[5].v + com[4].v;

226

out[i * 6 + 4].v = com[5].v - com[4].v;

227

out[i * 6 + 5].v = com[3].v - com[1].v;

228

})

229

230

// Compute destination address

231

TILE(uint, 36, 1, dst_indirect_y);

232

233

LOOP_UNROLLING(int, i, 0, 1, 36,

234

{

235

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

236

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36;

237

})

238

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

239

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

240

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

241

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

242

#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

243

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

244

#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

245

//! @cond Doxygen_Suppress

246

/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC

247

*

248

* @note Data layout supported: NHWC

249

* @note Data type supported: F32/F16

250

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

251

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

252

* @note The spatial dimensions of the source tensor are passed at run time through the _ISRC_WIDTH and _ISRC_HEIGHT kernel arguments

253

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

254

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

255

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

256

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

257

*

258

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

259

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

260

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

261

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

262

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

263

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

264

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

265

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

266

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

267

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

268

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

269

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

270

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

271

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

272

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

273

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

274

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

275

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

276

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

277

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

278

* @param[in] _ISRC_WIDTH The src tensor's width

279

* @param[in] _ISRC_HEIGHT The src tensor's height

280

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

281

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

282

*/

283

//! @endcond

284

__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(

285

TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

286

TENSOR4D(dst, BUFFER),

287

const int _ISRC_WIDTH,

288

const int _ISRC_HEIGHT,

289

const int _INUM_TILES_X,

290

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

291

{

292

const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM

293

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

294

#if defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

295

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

296

#else // defined(IS_BATCHED)

297

const int bout = 0; // BATCH SIZE IDX

298

#endif // defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

299

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

300

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

301

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

x -= PAD_LEFT;

y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

306

307

TILE(DATA_TYPE, 8, 1, in);

308

TILE(DATA_TYPE, 8, 1, out);

309

310

// Initialize the input tile

311

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

317

T_LOAD_NHWC(DATA_TYPE, 1, 8, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

318

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

319

T_LOAD_NHWC(DATA_TYPE, 8, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

320

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

321

322

TILE(DATA_TYPE, 1, 8, com);

323

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

324

com[0].s[0] = in[2].v - (DATA_TYPE)4.25f * in[4].v + in[6].v;

325

com[0].s[1] = in[1].v - (DATA_TYPE)4.25f * in[3].v + in[5].v;

326

com[0].s[2] = (DATA_TYPE)0.5f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)2.0f * in[5].v;

327

com[0].s[3] = (DATA_TYPE)0.25f * in[2].v - (DATA_TYPE)1.25f * in[4].v + in[6].v;

328

com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;

329

com[0].s[5] = (DATA_TYPE)2.0f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)0.5f * in[5].v;

330

out[0].s[0] = in[0].v - 5.25f * in[2].v + (DATA_TYPE)5.25f * in[4].v - in[6].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

331

out[1].s[0] = com[0].s[0] + com[0].s[1];

332

out[2].s[0] = com[0].s[0] - com[0].s[1];

333

out[3].s[0] = com[0].s[3] + com[0].s[2];

334

out[4].s[0] = com[0].s[3] - com[0].s[2];

335

out[5].s[0] = com[0].s[4] + com[0].s[5];

336

out[6].s[0] = com[0].s[4] - com[0].s[5];

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

337

out[7].s[0] = -in[1].v + (DATA_TYPE)5.25f * in[3].v - (DATA_TYPE)5.25f * in[5].v + in[7].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

338

339

TILE(uint, 8, 1, dst_indirect_y);

340

341

LOOP_UNROLLING(int, i, 0, 1, 8,

342

{

343

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

344

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;

345

})

346

347

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

348

349

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

350

351

TILE(DATA_TYPE, 64, 1, in);

352

TILE(DATA_TYPE, 64, 1, out);

353

354

// Initialize the input tile

355

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// Load the tile from a NHWC tensor

361

T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

362

363

TILE(DATA_TYPE, 8, 8, com);

364

365

LOOP_UNROLLING(int, i, 0, 1, 8,

366

{

367

com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x

368

com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x

369

com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x

370

com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x

371

com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

372

com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];

373

com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];

374

com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];

375

})

376

377

TILE(DATA_TYPE, 8, 8, tmp);

378

tmp[0].v = com[6].v;

379

tmp[1].v = com[0].v + com[1].v;

380

tmp[2].v = com[0].v - com[1].v;

381

tmp[3].v = com[2].v + com[3].v;

382

tmp[4].v = com[2].v - com[3].v;

383

tmp[5].v = com[4].v + com[5].v;

384

tmp[6].v = com[4].v - com[5].v;

385

tmp[7].v = com[7].v;

386

387

LOOP_UNROLLING(int, i, 0, 1, 8,

388

{

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

389

com[0].s[0] = tmp[i].s[2] - (DATA_TYPE)4.25f * tmp[i].s[4] + tmp[i].s[6];

390

com[0].s[1] = tmp[i].s[1] - (DATA_TYPE)4.25f * tmp[i].s[3] + tmp[i].s[5];

391

com[0].s[2] = (DATA_TYPE)0.5f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];

392

com[0].s[3] = (DATA_TYPE)0.25f * tmp[i].s[2] - (DATA_TYPE)1.25f * tmp[i].s[4] + tmp[i].s[6];

393

com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];

394

com[0].s[5] = (DATA_TYPE)2.0f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)0.5f * tmp[i].s[5];

395

out[i * 8 + 0].s[0] = tmp[i].s[0] - (DATA_TYPE)5.25f * tmp[i].s[2] + (DATA_TYPE)5.25f * tmp[i].s[4] - tmp[i].s[6];

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

396

out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];

397

out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];

398

out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];

399

out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];

400

out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];

401

out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

402

out[i * 8 + 7].s[0] = -tmp[i].s[1] + (DATA_TYPE)5.25f * tmp[i].s[3] - (DATA_TYPE)5.25f * tmp[i].s[5] + tmp[i].s[7];

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

403

})

404

405

TILE(uint, 64, 1, dst_indirect_y);

406

407

LOOP_UNROLLING(int, i, 0, 1, 64,

408

{

409

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

410

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;

411

})

412

413

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

414

415

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

416

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

417

#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

418

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

419

#if defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

420

//! @cond Doxygen_Suppress

421

/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/2x1/1x2 when the data layout is NHWC

422

*

423

* @note Data layout supported: NHWC

424

* @note Data type supported: F32/F16

425

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

426

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

427

* @note The spatial dimensions of the source tensor are passed at run time through the _ISRC_WIDTH and _ISRC_HEIGHT kernel arguments

428

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

429

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

430

* @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

431

* @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

432

*

433

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

434

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

435

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

436

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

437

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

438

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

439

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

440

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

441

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

442

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

443

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

444

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

445

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

446

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

447

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

448

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

449

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

450

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

451

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

452

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

453

* @param[in] _ISRC_WIDTH The src tensor's width

454

* @param[in] _ISRC_HEIGHT The src tensor's height

455

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

456

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

457

*/

458

//! @endcond

459

__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(

460

TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

461

TENSOR4D(dst, BUFFER),

462

const int _ISRC_WIDTH,

463

const int _ISRC_HEIGHT,

464

const int _INUM_TILES_X,

465

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

466

{

467

const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM

468

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

469

#if defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

470

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

471

#else // defined(IS_BATCHED)

472

const int bout = 0; // BATCH SIZE IDX

473

#endif // defined(IS_BATCHED)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

474

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

475

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

476

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

x -= PAD_LEFT;

y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

481

482

TILE(DATA_TYPE, 8, 1, in);

483

TILE(DATA_TYPE, 8, 1, out);

484

485

// Initialize the input tile

486

LOOP_UNROLLING(int, i, 0, 1, 8,

{

in[i].v = 0;

})

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

492

T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

493

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

494

T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

495

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

496

497

LOOP_UNROLLING(int, i, 0, 1, 8,

498

{

499

in[i].v *= (DATA_TYPE) - 36.0f;

500

})

501

502

TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };

503

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

504

com[0].s[0] = (DATA_TYPE)36.0f * in[2].v - (DATA_TYPE)13.0f * in[4].v + in[6].v;

505

com[0].s[1] = (DATA_TYPE)36.0f * in[1].v - (DATA_TYPE)13.0f * in[3].v + (DATA_TYPE)1.0f * in[5].v;

506

com[0].s[2] = (DATA_TYPE)9.0f * in[2].v - (DATA_TYPE)10.0f * in[4].v + in[6].v;

507

com[0].s[3] = (DATA_TYPE)18.0f * in[1].v - (DATA_TYPE)20.0f * in[3].v + (DATA_TYPE)2.0f * in[5].v;

508

com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;

509

com[0].s[5] = (DATA_TYPE)12.0f * in[1].v - (DATA_TYPE)15.0f * in[3].v + (DATA_TYPE)3.0f * in[5].v;

510

out[0].s[0] = (DATA_TYPE) - 36.0f * in[0].v + (DATA_TYPE)49.0f * in[2].v + -(DATA_TYPE)14.0f * in[4].v + in[6].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

511

out[1].s[0] = com[0].s[0] - com[0].s[1];

512

out[2].s[0] = com[0].s[0] + com[0].s[1];

513

out[3].s[0] = com[0].s[2] - com[0].s[3];

514

out[4].s[0] = com[0].s[2] + com[0].s[3];

515

out[5].s[0] = com[0].s[4] - com[0].s[5];

516

out[6].s[0] = com[0].s[4] + com[0].s[5];

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

517

out[7].s[0] = -(DATA_TYPE)36.0f * in[1].v + (DATA_TYPE)0.0f * in[2].v + (DATA_TYPE)49.0f * in[3].v - (DATA_TYPE)14.0f * in[5].v + in[7].v;

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

518

519

TILE(uint, 8, 1, dst_indirect_y);

520

521

LOOP_UNROLLING(int, i, 0, 1, 8,

522

{

523

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

524

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;

525

})

526

527

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

528

529

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

530

531

TILE(DATA_TYPE, 64, 1, in);

532

TILE(DATA_TYPE, 64, 1, out);

533

534

// Initialize the input tile

535

LOOP_UNROLLING(int, i, 0, 1, 64,

{

in[i].v = 0;

})

// Load the tile from a NHWC tensor

541

T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

542

543

TILE(DATA_TYPE, 8, 8, com);

544

545

LOOP_UNROLLING(int, i, 0, 1, 8,

546

{

547

com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

548

com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];

549

com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

550

com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];

551

com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

552

com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];

553

com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];

554

com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];

555

})

556

557

TILE(DATA_TYPE, 8, 8, tmp);

558

tmp[0].v = com[6].v;

559

tmp[1].v = com[0].v - com[1].v;

560

tmp[2].v = com[0].v + com[1].v;

561

tmp[3].v = com[2].v - com[3].v;

562

tmp[4].v = com[2].v + com[3].v;

563

tmp[5].v = com[4].v - com[5].v;

564

tmp[6].v = com[4].v + com[5].v;

565

tmp[7].v = com[7].v;

566

567

LOOP_UNROLLING(int, i, 0, 1, 8,

568

{

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

569

com[0].s[0] = (DATA_TYPE)36.0f * tmp[i].s[2] - (DATA_TYPE)13.0f * tmp[i].s[4] + tmp[i].s[6];

570

com[0].s[1] = (DATA_TYPE)36.0f * tmp[i].s[1] - (DATA_TYPE)13.0f * tmp[i].s[3] + (DATA_TYPE)1.0f * tmp[i].s[5];

571

com[0].s[2] = (DATA_TYPE)9.0f * tmp[i].s[2] - (DATA_TYPE)10.0f * tmp[i].s[4] + tmp[i].s[6];

572

com[0].s[3] = (DATA_TYPE)18.0f * tmp[i].s[1] - (DATA_TYPE)20.0f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];

573

com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];

574

com[0].s[5] = (DATA_TYPE)12.0f * tmp[i].s[1] - (DATA_TYPE)15.0f * tmp[i].s[3] + (DATA_TYPE)3.0f * tmp[i].s[5];

575

out[i * 8 + 0].s[0] = (DATA_TYPE) - 36.0f * tmp[i].s[0] + (DATA_TYPE)49.0f * tmp[i].s[2] + -(DATA_TYPE)14.0f * tmp[i].s[4] + tmp[i].s[6];

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

576

out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];

577

out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];

578

out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];

579

out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];

580

out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];

581

out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];

Gian Marco Iodice

2023-04-14 12:20:58 +0100

[diff] [blame]

582

out[i * 8 + 7].s[0] = -(DATA_TYPE)36.0f * tmp[i].s[1] + (DATA_TYPE)0.0f * tmp[i].s[2] + (DATA_TYPE)49.0f * tmp[i].s[3] - (DATA_TYPE)14.0f * tmp[i].s[5] + tmp[i].s[7];

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

583

})

584

585

TILE(uint, 64, 1, dst_indirect_y);

586

587

LOOP_UNROLLING(int, i, 0, 1, 64,

588

{

589

dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;

590

dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;

591

})

592

593

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

594

595

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

596

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

597

#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

598

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

599

#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

600

//! @cond Doxygen_Suppress

601

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC

602

*

603

* @note Data layout supported: NHWC

604

* @note Data type supported: F32/F16

605

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

606

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

607

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

608

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

609

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

610

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

611

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

612

*

613

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

614

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

615

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

616

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

617

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

618

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

619

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

620

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

621

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

622

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

623

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

624

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

625

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

626

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

627

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

628

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

629

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

630

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

631

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

632

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

633

* @param[in] _ISRC_WIDTH The src tensor's width

634

* @param[in] _ISRC_HEIGHT The src tensor's height

635

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

636

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

637

*/

638

//! @endcond

639

// Entry point for the 4x1 output tile / 3x1 filter variant. The body is a single call that
// forwards every argument, unchanged, to winograd_input_transform_4x4_3x3_stepz1_nhwc.
__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(

640

// The src_* / dst_* names used in the call below are not declared explicitly here, so they
// are expected to come from the TENSOR4D expansion (macro defined outside this chunk).
TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

641

TENSOR4D(dst, BUFFER),

642

// NOTE(review): the sizes arrive as kernel arguments, while the header @note still mentions
// -DSRC_WIDTH / -DSRC_HEIGHT at compile time - confirm which one the callee actually reads.
const int _ISRC_WIDTH,

643

const int _ISRC_HEIGHT,

644

const int _INUM_TILES_X,

645

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

646

{

647

// Positional forwarding: ten src_* fields, ten dst_* fields, then the four runtime sizes.
// NOTE(review): the order must match the callee's parameter list (i.e. the TENSOR4D
// expansion) - confirm against the macro definition before reordering anything.
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

666

dst_offset_first_element_in_bytes,

// Runtime sizes, passed through exactly as received.
_ISRC_WIDTH,

_ISRC_HEIGHT,

_INUM_TILES_X,

_INUM_TILES_Y);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

671

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

672

#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

673

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

674

#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

675

//! @cond Doxygen_Suppress

676

/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC

677

*

678

* @note Data layout supported: NHWC

679

* @note Data type supported: F32/F16

680

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

681

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

682

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

683

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

684

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

685

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

686

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

687

*

688

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

689

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

690

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

691

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

692

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

693

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

694

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

695

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

696

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

697

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

698

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

699

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

700

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

701

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

702

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

703

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

704

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

705

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

706

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

707

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

708

* @param[in] _ISRC_WIDTH The src tensor's width

709

* @param[in] _ISRC_HEIGHT The src tensor's height

710

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

711

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

712

*/

713

//! @endcond

714

// Entry point for the 4x1 output tile / 5x1 filter variant. The body is a single call that
// forwards every argument, unchanged, to winograd_input_transform_4x4_5x5_stepz1_nhwc.
__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(

715

// The src_* / dst_* names used in the call below are not declared explicitly here, so they
// are expected to come from the TENSOR4D expansion (macro defined outside this chunk).
TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

716

TENSOR4D(dst, BUFFER),

717

// NOTE(review): the sizes arrive as kernel arguments, while the header @note still mentions
// -DSRC_WIDTH / -DSRC_HEIGHT at compile time - confirm which one the callee actually reads.
const int _ISRC_WIDTH,

718

const int _ISRC_HEIGHT,

719

const int _INUM_TILES_X,

720

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

721

{

722

// Positional forwarding: ten src_* fields, ten dst_* fields, then the four runtime sizes.
// NOTE(review): the order must match the callee's parameter list (i.e. the TENSOR4D
// expansion) - confirm against the macro definition before reordering anything.
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

741

dst_offset_first_element_in_bytes,

// Runtime sizes, passed through exactly as received.
_ISRC_WIDTH,

_ISRC_HEIGHT,

_INUM_TILES_X,

_INUM_TILES_Y);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

746

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

747

#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

748

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

749

#if defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

750

//! @cond Doxygen_Suppress

751

/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC

752

*

753

* @note Data layout supported: NHWC

754

* @note Data type supported: F32/F16

755

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

756

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

757

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

758

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

759

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

760

* @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

761

* @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

762

*

763

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

764

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

765

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

766

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

767

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

768

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

769

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

770

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

771

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

772

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

773

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

774

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

775

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

776

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

777

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

778

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

779

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

780

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

781

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

782

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

783

* @param[in] _ISRC_WIDTH The src tensor's width

784

* @param[in] _ISRC_HEIGHT The src tensor's height

785

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

786

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

787

*/

788

//! @endcond

789

// Entry point for the 2x1 output tile / 7x1 filter variant. The body is a single call that
// forwards every argument, unchanged, to winograd_input_transform_2x2_7x7_stepz1_nhwc.
// NOTE(review): the #else / #endif comments of the kernel ending just above the 2X2_7X7
// guard show separate paths keyed on WINOGRAD_INPUT_TRANSFORM_HORIZONTAL / _VERTICAL;
// presumably that kernel is the callee and the horizontal path is the one meant for this
// entry point - confirm.
__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(

790

// The src_* / dst_* names used in the call below are not declared explicitly here, so they
// are expected to come from the TENSOR4D expansion (macro defined outside this chunk).
TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

791

TENSOR4D(dst, BUFFER),

792

// NOTE(review): the sizes arrive as kernel arguments, while the header @note still mentions
// -DSRC_WIDTH / -DSRC_HEIGHT at compile time - confirm which one the callee actually reads.
const int _ISRC_WIDTH,

793

const int _ISRC_HEIGHT,

794

const int _INUM_TILES_X,

795

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

796

{

797

// Positional forwarding: ten src_* fields, ten dst_* fields, then the four runtime sizes.
// NOTE(review): the order must match the callee's parameter list (i.e. the TENSOR4D
// expansion) - confirm against the macro definition before reordering anything.
winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

816

dst_offset_first_element_in_bytes,

// Runtime sizes, passed through exactly as received.
_ISRC_WIDTH,

_ISRC_HEIGHT,

_INUM_TILES_X,

_INUM_TILES_Y);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

821

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

822

#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

823

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

824

#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

825

//! @cond Doxygen_Suppress

826

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC

827

*

828

* @note Data layout supported: NHWC

829

* @note Data type supported: F32/F16

830

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

831

*

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

832

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

833

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

834

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

835

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

836

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

837

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

838

*

839

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

840

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

841

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

842

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

843

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

844

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

845

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

846

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

847

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

848

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

849

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

850

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

851

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

852

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

853

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

854

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

855

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

856

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

857

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

858

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

859

* @param[in] _ISRC_WIDTH The src tensor's width

860

* @param[in] _ISRC_HEIGHT The src tensor's height

861

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

862

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

863

*/

864

//! @endcond

865

// Entry point for the 1x4 output tile / 1x3 filter variant. The body is a single call that
// forwards every argument, unchanged, to winograd_input_transform_4x4_3x3_stepz1_nhwc.
__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(

866

// The src_* / dst_* names used in the call below are not declared explicitly here, so they
// are expected to come from the TENSOR4D expansion (macro defined outside this chunk).
TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

867

TENSOR4D(dst, BUFFER),

868

// NOTE(review): the sizes arrive as kernel arguments, while the header @note still mentions
// -DSRC_WIDTH / -DSRC_HEIGHT at compile time - confirm which one the callee actually reads.
const int _ISRC_WIDTH,

869

const int _ISRC_HEIGHT,

870

const int _INUM_TILES_X,

871

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

872

{

873

// Positional forwarding: ten src_* fields, ten dst_* fields, then the four runtime sizes.
// NOTE(review): the order must match the callee's parameter list (i.e. the TENSOR4D
// expansion) - confirm against the macro definition before reordering anything.
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

892

dst_offset_first_element_in_bytes,

// Runtime sizes, passed through exactly as received.
_ISRC_WIDTH,

_ISRC_HEIGHT,

_INUM_TILES_X,

_INUM_TILES_Y);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

897

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

898

#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

899

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

900

#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

901

//! @cond Doxygen_Suppress

902

/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC

903

*

904

* @note Data layout supported: NHWC

905

* @note Data type supported: F32/F16

906

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

907

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

908

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

909

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

910

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

911

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

912

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

913

*

914

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

915

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

916

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

917

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

918

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

919

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

920

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

921

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

922

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

923

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

924

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

925

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

926

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

927

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

928

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

929

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

930

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

931

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

932

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

933

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

934

* @param[in] _ISRC_WIDTH The src tensor's width

935

* @param[in] _ISRC_HEIGHT The src tensor's height

936

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

937

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

938

*/

939

//! @endcond

940

// Entry point for the 1x4 output tile / 1x5 filter variant. The body is a single call that
// forwards every argument, unchanged, to winograd_input_transform_4x4_5x5_stepz1_nhwc.
__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(

941

// The src_* / dst_* names used in the call below are not declared explicitly here, so they
// are expected to come from the TENSOR4D expansion (macro defined outside this chunk).
TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

942

TENSOR4D(dst, BUFFER),

943

// NOTE(review): the sizes arrive as kernel arguments, while the header @note still mentions
// -DSRC_WIDTH / -DSRC_HEIGHT at compile time - confirm which one the callee actually reads.
const int _ISRC_WIDTH,

944

const int _ISRC_HEIGHT,

945

const int _INUM_TILES_X,

946

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

947

{

948

// Positional forwarding: ten src_* fields, ten dst_* fields, then the four runtime sizes.
// NOTE(review): the order must match the callee's parameter list (i.e. the TENSOR4D
// expansion) - confirm against the macro definition before reordering anything.
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

967

dst_offset_first_element_in_bytes,

// Runtime sizes, passed through exactly as received.
_ISRC_WIDTH,

_ISRC_HEIGHT,

_INUM_TILES_X,

_INUM_TILES_Y);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

972

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

973

#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

974

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

975

#if defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

976

//! @cond Doxygen_Suppress

977

/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC

978

*

979

* @note Data layout supported: NHWC

980

* @note Data type supported: F32/F16

981

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

982

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

983

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

984

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

985

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

986

* @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

987

* @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

988

*

989

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

990

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

991

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

992

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

993

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

994

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

995

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

996

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

997

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

998

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

999

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1000

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1001

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1002

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1003

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1004

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1005

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1006

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1007

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1008

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

1009

* @param[in] _ISRC_WIDTH The src tensor's width

1010

* @param[in] _ISRC_HEIGHT The src tensor's height

1011

* @param[in] _INUM_TILES_X The number of tiles in the X dimension

1012

* @param[in] _INUM_TILES_Y The number of tiles in the Y dimension

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1013

*/

1014

//! @endcond

1015

// Entry point for the 1x2 output tile / 1x7 filter variant. The body is a single call that
// forwards every argument, unchanged, to winograd_input_transform_2x2_7x7_stepz1_nhwc.
// NOTE(review): the #else / #endif comments of the kernel ending just above the 2X2_7X7
// guard show separate paths keyed on WINOGRAD_INPUT_TRANSFORM_HORIZONTAL / _VERTICAL;
// presumably that kernel is the callee and the vertical path is the one meant for this
// entry point - confirm.
__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(

1016

// The src_* / dst_* names used in the call below are not declared explicitly here, so they
// are expected to come from the TENSOR4D expansion (macro defined outside this chunk).
TENSOR4D(src, BUFFER),

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

1017

TENSOR4D(dst, BUFFER),

1018

// NOTE(review): the sizes arrive as kernel arguments, while the header @note still mentions
// -DSRC_WIDTH / -DSRC_HEIGHT at compile time - confirm which one the callee actually reads.
const int _ISRC_WIDTH,

1019

const int _ISRC_HEIGHT,

1020

const int _INUM_TILES_X,

1021

const int _INUM_TILES_Y)

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1022

{

1023

// Positional forwarding: ten src_* fields, ten dst_* fields, then the four runtime sizes.
// NOTE(review): the order must match the callee's parameter list (i.e. the TENSOR4D
// expansion) - confirm against the macro definition before reordering anything.
winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

1042

dst_offset_first_element_in_bytes,

// Runtime sizes, passed through exactly as received.
_ISRC_WIDTH,

_ISRC_HEIGHT,

_INUM_TILES_X,

_INUM_TILES_Y);

Adnan AlSinan

2021-07-05 13:12:52 +0100

[diff] [blame]

1047

}

ramelg01

2022-02-04 20:49:14 +0000

[diff] [blame]

1048

#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)

1049

#endif // defined(NHWC)

Adnan AlSinan