Blame - src/core/CL/cl_kernels/winograd_input_transform.cl - ml/ComputeLibrary

2018-10-29 18:01:52 +0000

[diff] [blame]

113

#else /* defined(SRC_DEPTH) */

114

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

115

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

116

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

117

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

118

119

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

120

VEC_DATA_TYPE(DATA_TYPE, 4)

121

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

122

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

123

VEC_DATA_TYPE(DATA_TYPE, 4)

124

in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

125

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

126

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

127

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

128

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

129

VEC_DATA_TYPE(DATA_TYPE, 4)

130

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

131

VEC_DATA_TYPE(DATA_TYPE, 4)

132

in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

133

VEC_DATA_TYPE(DATA_TYPE, 4)

134

in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

135

VEC_DATA_TYPE(DATA_TYPE, 4)

136

in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

137

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

138

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

139

VEC_DATA_TYPE(DATA_TYPE, 4)

140

tmp0 = in_row0;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

141

142

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

143

tmp0 -= in_row2;

144

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

145

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

146

DATA_TYPE out00 = tmp0.s0 - tmp0.s2;

147

DATA_TYPE out01 = tmp0.s1 + tmp0.s2;

148

DATA_TYPE out02 = tmp0.s2 - tmp0.s1;

149

DATA_TYPE out03 = tmp0.s1 - tmp0.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

150

151

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

152

VEC_DATA_TYPE(DATA_TYPE, 4)

153

tmp1 = in_row1 + in_row2;

154

VEC_DATA_TYPE(DATA_TYPE, 4)

155

tmp2 = in_row2 - in_row1;

156

VEC_DATA_TYPE(DATA_TYPE, 4)

157

tmp3 = in_row1 - in_row3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

158

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

159

DATA_TYPE out10 = tmp1.s0 - tmp1.s2;

160

DATA_TYPE out11 = tmp1.s1 + tmp1.s2;

161

DATA_TYPE out12 = tmp1.s2 - tmp1.s1;

162

DATA_TYPE out13 = tmp1.s1 - tmp1.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

163

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

164

DATA_TYPE out20 = tmp2.s0 - tmp2.s2;

165

DATA_TYPE out21 = tmp2.s1 + tmp2.s2;

166

DATA_TYPE out22 = tmp2.s2 - tmp2.s1;

167

DATA_TYPE out23 = tmp2.s1 - tmp2.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

168

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

169

DATA_TYPE out30 = tmp3.s0 - tmp3.s2;

170

DATA_TYPE out31 = tmp3.s1 + tmp3.s2;

171

DATA_TYPE out32 = tmp3.s2 - tmp3.s1;

172

DATA_TYPE out33 = tmp3.s1 - tmp3.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

173

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

174

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

175

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

176

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

177

#else /* defined(SRC_DEPTH) */

178

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;

179

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

180

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

181

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;

182

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;

183

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;

184

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

185

186

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

187

*((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;

188

*((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;

189

*((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;

190

*((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;

191

*((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;

192

*((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;

193

*((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;

194

*((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;

195

*((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;

196

*((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;

197

*((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;

198

*((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

199

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

200

}

201

202

/** This OpenCL kernel computes the Winograd input transform for a 3x3/3x1 or 1x3 kernel and a 2x2/2x1 or 1x2 output tile, handling two consecutive channels per work-item.
 *
 * @note The number of channels must be a multiple of 2: each work-item reads channel z and channel z + 1.
 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 * @note When -DSRC_DEPTH is defined, twice the global id along dimension 2 is split into a channel index (modulo SRC_DEPTH) and a batch index (division by SRC_DEPTH);
 *       the batch index is applied through @p src_stride_w and @p dst_stride_w. Without -DSRC_DEPTH the two stride_w arguments are not used.
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 */
__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // Tile coordinates; dimension 2 advances two channels at a time
    const int tile_x = get_global_id(0);
    const int tile_y = get_global_id(1);
#if defined(SRC_DEPTH)
    const int ch    = (get_global_id(2) * 2) % SRC_DEPTH;
    const int batch = (get_global_id(2) * 2) / SRC_DEPTH;
#else  /* defined(SRC_DEPTH) */
    const int ch = get_global_id(2) * 2;
#endif /* defined(SRC_DEPTH) */

    // Source address of the first channel: tile origin, then batch offset (if any), then shifted back by the padding
    __global uchar *src_c0 = src_ptr + src_offset_first_element_in_bytes + tile_x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + tile_y * OUTPUT_TILE_H * src_stride_y + ch * src_stride_z;
#if defined(SRC_DEPTH)
    src_c0 += batch * src_stride_w;
#endif /* defined(SRC_DEPTH) */
    src_c0 -= (int)PAD_LEFT * sizeof(DATA_TYPE);
    src_c0 -= (int)PAD_TOP * src_stride_y;

    // The second channel handled by this work-item lives one Z stride further
    __global uchar *src_c1 = src_c0 + src_stride_z;

    // c0_t* / c1_t* hold, per channel, the vectors that feed the final per-lane combination.
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    // Single row of four contiguous elements per channel
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_t0 = vload4(0, (__global DATA_TYPE *)(src_c0));
    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_t0 = vload4(0, (__global DATA_TYPE *)(src_c1));
#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    // Single column per channel: gather the first element of four consecutive rows
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_t0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_c0 + 0 * src_stride_y)),
                                                                           *((__global DATA_TYPE *)(src_c0 + 1 * src_stride_y)),
                                                                           *((__global DATA_TYPE *)(src_c0 + 2 * src_stride_y)),
                                                                           *((__global DATA_TYPE *)(src_c0 + 3 * src_stride_y)));
    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_t0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_c1 + 0 * src_stride_y)),
                                                                           *((__global DATA_TYPE *)(src_c1 + 1 * src_stride_y)),
                                                                           *((__global DATA_TYPE *)(src_c1 + 2 * src_stride_y)),
                                                                           *((__global DATA_TYPE *)(src_c1 + 3 * src_stride_y)));
#else  // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    // Full 4x4 input tile per channel
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_r0 = vload4(0, (__global DATA_TYPE *)(src_c0 + 0 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_r1 = vload4(0, (__global DATA_TYPE *)(src_c0 + 1 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_r2 = vload4(0, (__global DATA_TYPE *)(src_c0 + 2 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_r3 = vload4(0, (__global DATA_TYPE *)(src_c0 + 3 * src_stride_y));

    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_r0 = vload4(0, (__global DATA_TYPE *)(src_c1 + 0 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_r1 = vload4(0, (__global DATA_TYPE *)(src_c1 + 1 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_r2 = vload4(0, (__global DATA_TYPE *)(src_c1 + 2 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_r3 = vload4(0, (__global DATA_TYPE *)(src_c1 + 3 * src_stride_y));

    // Combine the rows first: (r0 - r2, r1 + r2, r2 - r1, r1 - r3)
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_t0 = c0_r0 - c0_r2;
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_t1 = c0_r1 + c0_r2;
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_t2 = c0_r2 - c0_r1;
    const VEC_DATA_TYPE(DATA_TYPE, 4) c0_t3 = c0_r1 - c0_r3;

    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_t0 = c1_r0 - c1_r2;
    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_t1 = c1_r1 + c1_r2;
    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_t2 = c1_r2 - c1_r1;
    const VEC_DATA_TYPE(DATA_TYPE, 4) c1_t3 = c1_r1 - c1_r3;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // Per-lane combination (s0 - s2, s1 + s2, s2 - s1, s1 - s3); component 0 is the first channel, component 1 the second
    const VEC_DATA_TYPE(DATA_TYPE, 2) res00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t0.s0 - c0_t0.s2, c1_t0.s0 - c1_t0.s2);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t0.s1 + c0_t0.s2, c1_t0.s1 + c1_t0.s2);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t0.s2 - c0_t0.s1, c1_t0.s2 - c1_t0.s1);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t0.s1 - c0_t0.s3, c1_t0.s1 - c1_t0.s3);

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    const VEC_DATA_TYPE(DATA_TYPE, 2) res10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t1.s0 - c0_t1.s2, c1_t1.s0 - c1_t1.s2);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t1.s1 + c0_t1.s2, c1_t1.s1 + c1_t1.s2);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t1.s2 - c0_t1.s1, c1_t1.s2 - c1_t1.s1);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t1.s1 - c0_t1.s3, c1_t1.s1 - c1_t1.s3);

    const VEC_DATA_TYPE(DATA_TYPE, 2) res20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t2.s0 - c0_t2.s2, c1_t2.s0 - c1_t2.s2);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t2.s1 + c0_t2.s2, c1_t2.s1 + c1_t2.s2);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t2.s2 - c0_t2.s1, c1_t2.s2 - c1_t2.s1);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t2.s1 - c0_t2.s3, c1_t2.s1 - c1_t2.s3);

    const VEC_DATA_TYPE(DATA_TYPE, 2) res30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t3.s0 - c0_t3.s2, c1_t3.s0 - c1_t3.s2);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t3.s1 + c0_t3.s2, c1_t3.s1 + c1_t3.s2);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t3.s2 - c0_t3.s1, c1_t3.s2 - c1_t3.s1);
    const VEC_DATA_TYPE(DATA_TYPE, 2) res33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(c0_t3.s1 - c0_t3.s3, c1_t3.s1 - c1_t3.s3);
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // Destination address: channel index along X, linearised tile index along Y, batch offset (if any)
    __global uchar *dst_base = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + (tile_x + tile_y * (int)NUM_TILES_X) * dst_stride_y;
#if defined(SRC_DEPTH)
    dst_base += batch * dst_stride_w;
#endif /* defined(SRC_DEPTH) */

    // Each transformed element goes to its own Z plane; the two channels are stored side by side
    vstore2(res00, 0, (__global DATA_TYPE *)(dst_base + 0 * dst_stride_z));
    vstore2(res01, 0, (__global DATA_TYPE *)(dst_base + 1 * dst_stride_z));
    vstore2(res02, 0, (__global DATA_TYPE *)(dst_base + 2 * dst_stride_z));
    vstore2(res03, 0, (__global DATA_TYPE *)(dst_base + 3 * dst_stride_z));

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    vstore2(res10, 0, (__global DATA_TYPE *)(dst_base + 4 * dst_stride_z));
    vstore2(res11, 0, (__global DATA_TYPE *)(dst_base + 5 * dst_stride_z));
    vstore2(res12, 0, (__global DATA_TYPE *)(dst_base + 6 * dst_stride_z));
    vstore2(res13, 0, (__global DATA_TYPE *)(dst_base + 7 * dst_stride_z));
    vstore2(res20, 0, (__global DATA_TYPE *)(dst_base + 8 * dst_stride_z));
    vstore2(res21, 0, (__global DATA_TYPE *)(dst_base + 9 * dst_stride_z));
    vstore2(res22, 0, (__global DATA_TYPE *)(dst_base + 10 * dst_stride_z));
    vstore2(res23, 0, (__global DATA_TYPE *)(dst_base + 11 * dst_stride_z));
    vstore2(res30, 0, (__global DATA_TYPE *)(dst_base + 12 * dst_stride_z));
    vstore2(res31, 0, (__global DATA_TYPE *)(dst_base + 13 * dst_stride_z));
    vstore2(res32, 0, (__global DATA_TYPE *)(dst_base + 14 * dst_stride_z));
    vstore2(res33, 0, (__global DATA_TYPE *)(dst_base + 15 * dst_stride_z));
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}

383

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

384

/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

385

*

386

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

387

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

388

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

389

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

390

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

391

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

392

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

393

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

394

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

395

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

396

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

397

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

398

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

399

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

400

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

401

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

402

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

403

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

404

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

405

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

406

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

407

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

408

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

409

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

410

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

411

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

412

*/

413

__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(

414

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

415

TENSOR3D_DECLARATION(dst),

416

uint src_stride_w,

417

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

418

{

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

419

const int x = get_global_id(0);

420

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

421

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

422

const int z = get_global_id(2) % SRC_DEPTH;

423

const int b = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

424

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

425

const int z = get_global_id(2);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

426

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

427

428

// Compute input address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

429

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

430

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

431

#else /* defined(SRC_DEPTH) */

432

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

433

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

434

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

435

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

436

437

#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

438

// Row0

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

439

VEC_DATA_TYPE(DATA_TYPE, 4)

440

d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

441

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

442

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

443

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

444

VEC_DATA_TYPE(DATA_TYPE, 2)

445

d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),

446

*((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

447

#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

448

// Row0

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

449

VEC_DATA_TYPE(DATA_TYPE, 4)

450

d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

451

VEC_DATA_TYPE(DATA_TYPE, 2)

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

452

d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

453

#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

454

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

455

DATA_TYPE out0 = 0.0f;

456

DATA_TYPE out1 = 0.0f;

457

DATA_TYPE out2 = 0.0f;

458

DATA_TYPE out3 = 0.0f;

459

DATA_TYPE out4 = 0.0f;

460

DATA_TYPE out5 = 0.0f;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

461

462

// Channels [0, 5]: [out00, out01, out02, out03, out04, out05]

463

out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;

464

out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;

465

out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;

466

out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;

467

out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;

468

out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;

469

470

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

471

// Row4

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

472

VEC_DATA_TYPE(DATA_TYPE, 4)

473

d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

474

VEC_DATA_TYPE(DATA_TYPE, 2)

475

d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

476

477

// k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

478

DATA_TYPE k0 = d41.s0;

479

DATA_TYPE k1 = d41.s0;

480

DATA_TYPE k2 = d41.s0;

481

DATA_TYPE k3 = d41.s0;

482

DATA_TYPE k4 = d41.s0;

483

DATA_TYPE k5 = 0.0f;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

484

485

k0 += 4.0f * d40.s0 - 5.0f * d40.s2;

486

k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;

487

k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;

488

k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;

489

k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;

490

k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;

out0 += k0;

out1 += k1;

out2 += k2;

out3 += k3;

out4 += k4;

out5 += k5;

// Row2

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

500

VEC_DATA_TYPE(DATA_TYPE, 4)

501

d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

502

VEC_DATA_TYPE(DATA_TYPE, 2)

503

d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

504

505

out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;

506

out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;

507

out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;

508

out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;

509

out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;

510

out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;

511

#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

512

513

// Compute destination address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

514

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

515

__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

516

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

517

__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

518

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

519

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

520

uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

521

522

*(dst_addr) = out0;

523

dst_addr += dst_plane_stride;

524

*(dst_addr) = out1;

525

dst_addr += dst_plane_stride;

526

*(dst_addr) = out2;

527

dst_addr += dst_plane_stride;

528

*(dst_addr) = out3;

529

dst_addr += dst_plane_stride;

530

*(dst_addr) = out4;

531

dst_addr += dst_plane_stride;

532

*(dst_addr) = out5;

533

dst_addr += dst_plane_stride;

534

535

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

DATA_TYPE out6 = k0;

DATA_TYPE out7 = k1;

DATA_TYPE out8 = k2;

DATA_TYPE out9 = k3;

DATA_TYPE out10 = k4;

541

DATA_TYPE out11 = k5;

542

DATA_TYPE out12 = k0;

543

DATA_TYPE out13 = k1;

544

DATA_TYPE out14 = k2;

545

DATA_TYPE out15 = k3;

546

DATA_TYPE out16 = k4;

547

DATA_TYPE out17 = k5;

548

DATA_TYPE out18 = k0;

549

DATA_TYPE out19 = k1;

550

DATA_TYPE out20 = k2;

551

DATA_TYPE out21 = k3;

552

DATA_TYPE out22 = k4;

553

DATA_TYPE out23 = k5;

554

DATA_TYPE out24 = k0;

555

DATA_TYPE out25 = k1;

556

DATA_TYPE out26 = k2;

557

DATA_TYPE out27 = k3;

558

DATA_TYPE out28 = k4;

559

DATA_TYPE out29 = k5;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

560

561

// Row1

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

562

VEC_DATA_TYPE(DATA_TYPE, 4)

563

d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

564

VEC_DATA_TYPE(DATA_TYPE, 2)

565

d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

566

567

// Row3

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

568

VEC_DATA_TYPE(DATA_TYPE, 4)

569

d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

570

VEC_DATA_TYPE(DATA_TYPE, 2)

571

d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

572

573

// Compute common parts for the channels between [6, 29]

574

// Channels [6, 11]: [out10, out11, out12, out13, out14, out15]

575

// Channels [12, 17]: [out20, out21, out22, out23, out24, out25]

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

576

DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;

577

DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;

578

DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;

579

DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;

580

DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;

581

DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;

582

DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;

583

DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;

584

DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;

585

DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;

586

DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;

587

DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

588

589

// Channels [18, 23]: [out30, out31, out32, out33, out34, out35]

590

// Channels [24, 29]: [out40, out41, out42, out43, out44, out45]

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

591

DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;

592

DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0

593

DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0

594

DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;

595

DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;

596

DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3

597

DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0

598

DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;

599

DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;

600

DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)

601

DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1

602

DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

603

604

out6 += part0 - part1;

605

out12 += part0 + part1;

606

out7 += part2 + part3 + part4 + part5;

607

out8 += part2 - part3 + part4 - part5;

608

out13 += part2 + part3 - part4 - part5;

609

out14 += part2 - part3 - part4 + part5;

610

out9 += part6 + part7 + part8 + part9;

611

out10 += part6 - part7 + part8 - part9;

612

out15 += part6 - part7 - part8 + part9;

613

out16 += part6 + part7 - part8 - part9;

614

out11 += part10 + part11;

615

out17 += part10 - part11;

616

617

out18 += part13 - part12;

618

out24 += part13 + part12;

619

out19 += part14 + part15 + part16 + part17;

620

out20 += part14 - part15 + part16 - part17;

621

out25 += part14 - part15 - part16 + part17;

622

out26 += part14 + part15 - part16 - part17;

623

out21 += part18 + part19 + part20 + part21;

624

out22 += part18 - part19 + part20 - part21;

625

out27 += part18 - part19 - part20 + part21;

626

out28 += part18 + part19 - part20 - part21;

627

out23 += part22 + part23;

628

out29 += part22 - part23;

629

630

*(dst_addr) = out6;

631

dst_addr += dst_plane_stride;

632

*(dst_addr) = out7;

633

dst_addr += dst_plane_stride;

634

*(dst_addr) = out8;

635

dst_addr += dst_plane_stride;

636

*(dst_addr) = out9;

637

dst_addr += dst_plane_stride;

638

*(dst_addr) = out10;

639

dst_addr += dst_plane_stride;

640

*(dst_addr) = out11;

641

dst_addr += dst_plane_stride;

642

*(dst_addr) = out12;

643

dst_addr += dst_plane_stride;

644

*(dst_addr) = out13;

645

dst_addr += dst_plane_stride;

646

*(dst_addr) = out14;

647

dst_addr += dst_plane_stride;

648

*(dst_addr) = out15;

649

dst_addr += dst_plane_stride;

650

*(dst_addr) = out16;

651

dst_addr += dst_plane_stride;

652

*(dst_addr) = out17;

653

dst_addr += dst_plane_stride;

654

655

*(dst_addr) = out18;

656

dst_addr += dst_plane_stride;

657

*(dst_addr) = out19;

658

dst_addr += dst_plane_stride;

659

*(dst_addr) = out20;

660

dst_addr += dst_plane_stride;

661

*(dst_addr) = out21;

662

dst_addr += dst_plane_stride;

663

*(dst_addr) = out22;

664

dst_addr += dst_plane_stride;

665

*(dst_addr) = out23;

666

dst_addr += dst_plane_stride;

667

*(dst_addr) = out24;

668

dst_addr += dst_plane_stride;

669

*(dst_addr) = out25;

670

dst_addr += dst_plane_stride;

671

*(dst_addr) = out26;

672

dst_addr += dst_plane_stride;

673

*(dst_addr) = out27;

674

dst_addr += dst_plane_stride;

675

*(dst_addr) = out28;

676

dst_addr += dst_plane_stride;

677

*(dst_addr) = out29;

678

dst_addr += dst_plane_stride;

679

680

// Row5

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

681

VEC_DATA_TYPE(DATA_TYPE, 4)

682

d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

683

VEC_DATA_TYPE(DATA_TYPE, 2)

684

d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

685

686

// Channels [30, 35]

687

out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

688

out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

689

out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

690

out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

691

out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

692

out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;

693

694

*(dst_addr) = out0;

695

dst_addr += dst_plane_stride;

696

*(dst_addr) = out1;

697

dst_addr += dst_plane_stride;

698

*(dst_addr) = out2;

699

dst_addr += dst_plane_stride;

700

*(dst_addr) = out3;

701

dst_addr += dst_plane_stride;

702

*(dst_addr) = out4;

703

dst_addr += dst_plane_stride;

704

*(dst_addr) = out5;

705

dst_addr += dst_plane_stride;

706

#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

707

}

708

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

709

/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW

710

*

711

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).

712

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).

713

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

714

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

715

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

716

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

717

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

718

*

719

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

720

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

721

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

722

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

723

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

724

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

725

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

726

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

727

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

728

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

729

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

730

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

731

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

732

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

733

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

734

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

735

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

736

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

737

*/

738

__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(

739

TENSOR3D_DECLARATION(src),

740

TENSOR3D_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int x = get_global_id(0);

745

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

746

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

747

const int z = get_global_id(2) % SRC_DEPTH;

748

const int b = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

749

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

750

const int z = get_global_id(2);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

751

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

752

753

// Compute input address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

754

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

755

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

756

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

757

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

758

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

759

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

760

761

// Load input tile

762

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

763

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));

764

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

765

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

766

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

767

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

768

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),

769

*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),

770

*((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),

771

*((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),

772

*((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));

773

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

774

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

775

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

776

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

777

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

778

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

779

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

780

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));

781

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));

782

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

783

784

// Calculate common factors for intermediate tensor

785

VEC_DATA_TYPE(DATA_TYPE, 8)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

786

tmp0 = in_row0;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

787

VEC_DATA_TYPE(DATA_TYPE, 8)

788

comm_fact0 = 0.0f;

789

790

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

791

comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

792

tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

793

794

VEC_DATA_TYPE(DATA_TYPE, 8)

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

795

comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

796

VEC_DATA_TYPE(DATA_TYPE, 8)

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

797

comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

798

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

799

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;

800

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

801

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

802

comm_fact0 = (DATA_TYPE)2.5f * in_row3;

803

comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

804

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

805

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;

806

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

807

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

808

comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;

809

comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

810

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

811

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;

812

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;

813

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

814

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

815

816

// Calculate output rows (reuse comm_fact0 vector)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

817

VEC_DATA_TYPE(DATA_TYPE, 8)

818

out0;

819

820

OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

821

822

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

823

VEC_DATA_TYPE(DATA_TYPE, 8)

824

out1, out2, out3, out4, out5, out6, out7;

825

826

OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);

827

OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);

828

OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);

829

OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);

830

OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);

831

OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);

832

OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

833

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

834

835

// Store values across the channels

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

836

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

837

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

838

#else /* defined(SRC_DEPTH) */

839

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;

840

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

841

842

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;

843

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;

844

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;

845

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;

846

*((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;

847

*((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;

848

*((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;

849

*((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;

850

851

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

852

*((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;

853

*((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;

854

*((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;

855

*((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;

856

*((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;

857

*((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;

858

*((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;

859

*((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;

860

*((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;

861

*((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;

862

*((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;

863

*((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;

864

*((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;

865

*((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;

866

*((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;

867

*((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;

868

*((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;

869

*((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;

870

*((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;

871

*((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;

872

*((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;

873

*((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;

874

*((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;

875

*((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;

876

*((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;

877

*((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;

878

*((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;

879

*((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;

880

*((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;

881

*((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;

882

*((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;

883

*((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;

884

*((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;

885

*((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;

886

*((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;

887

*((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;

888

*((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;

889

*((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;

890

*((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;

891

*((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;

892

*((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;

893

*((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;

894

*((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;

895

*((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;

896

*((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;

897

*((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;

898

*((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;

899

*((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;

900

*((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;

901

*((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;

902

*((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;

903

*((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;

904

*((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;

905

*((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;

906

*((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;

907

*((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;

908

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

909

}

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

910

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

911

#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)

912

//! @cond Doxygen_Suppress

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

913

/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

914

*

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

915

* @note Data layout supported: NHWC

916

* @note Data type supported: F32/F16

917

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

918

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

919

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

920

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

921

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

922

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

923

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

924

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

925

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

926

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

927

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

928

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

929

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

930

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

931

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

932

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

933

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

934

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

935

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

936

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

937

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

938

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

939

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

940

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

941

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

942

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

943

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

944

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

945

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

946

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

947

//! @endcond

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

948

__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

949

TENSOR4D(src, BUFFER),

950

TENSOR4D(dst, BUFFER))

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

951

{

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

952

const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM

953

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

954

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

955

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

956

// All the tensor dimensions are passed at compile time.

957

// In case of dynamic tensor support, the following dimensions should be passed as function argument.

958

#define _ISRC_WIDTH SRC_WIDTH

959

#define _ISRC_HEIGHT SRC_HEIGHT

960

#define _INUM_TILES_X NUM_TILES_X

961

#define _INUM_TILES_Y NUM_TILES_Y

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

962

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

963

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

964

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

965

x -= PAD_LEFT;

966

y -= PAD_TOP;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

967

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

968

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

969

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

970

TILE(DATA_TYPE, 6, 1, in);

971

TILE(DATA_TYPE, 6, 1, out);

972

973

// Initialize the input tile

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

974

LOOP_UNROLLING(int, i, 0, 1, 6,

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

975

{

976

in[i].v = 0;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

977

})

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

978

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

979

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

980

T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

981

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

982

T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

983

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

984

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

985

TILE(DATA_TYPE, 6, 1, com);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

986

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

987

LOOP_UNROLLING(int, i, 0, 1, 6,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

988

{

989

in[i].v *= 4.0f;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

990

})

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

991

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

992

com[0].v = in[2].v - 4.f * in[0].v;

993

com[1].v = in[3].v - 4.f * in[1].v;

994

com[2].v = in[4].v - 4.f * in[2].v;

995

com[3].v = in[5].v - 4.f * in[3].v;

996

com[4].v = in[3].v - in[1].v;

997

com[4].v = com[4].v + com[4].v;

998

com[5].v = in[4].v - in[2].v;

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

999

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1000

out[0].v = com[2].v - com[0].v;

1001

out[1].v = com[2].v + com[1].v;

1002

out[2].v = com[2].v - com[1].v;

1003

out[3].v = com[5].v + com[4].v;

1004

out[4].v = com[5].v - com[4].v;

1005

out[5].v = com[3].v - com[1].v;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1006

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1007

TILE(uint, 6, 1, dst_indirect_y);

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

1008

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1009

LOOP_UNROLLING(int, i, 0, 1, 6,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1010

{

1011

dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;

1012

dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1013

})

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

1014

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1015

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

1016

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1017

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1018

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1019

TILE(DATA_TYPE, 36, 1, in);

1020

1021

// Initialize the input tile

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1022

LOOP_UNROLLING(int, i, 0, 1, 36,

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1023

{

1024

in[i].v = 0;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1025

})

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1026

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1027

// Load the tile from a NHWC tensor

1028

T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

1029

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1030

TILE(DATA_TYPE, 6, 1, com);

1031

TILE(DATA_TYPE, 36, 1, tmp);

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

1032

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1033

LOOP_UNROLLING(int, i, 0, 1, 6,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1034

{

1035

com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;

1036

com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;

1037

com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;

1038

com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;

1039

com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;

1040

com[4].v = com[4].v + com[4].v;

1041

com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;

1042

tmp[i + 0 * 6].v = com[2].v - com[0].v;

1043

tmp[i + 1 * 6].v = com[2].v + com[1].v;

1044

tmp[i + 2 * 6].v = com[2].v - com[1].v;

1045

tmp[i + 3 * 6].v = com[5].v + com[4].v;

1046

tmp[i + 4 * 6].v = com[5].v - com[4].v;

1047

tmp[i + 5 * 6].v = com[3].v - com[1].v;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1048

})

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

1049

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1050

TILE(DATA_TYPE, 36, 1, out);

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

1051

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1052

LOOP_UNROLLING(int, i, 0, 1, 6,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1053

{

1054

com[0].v = tmp[i * 6 + 2].v - 4.f * tmp[i * 6 + 0].v;

1055

com[1].v = tmp[i * 6 + 3].v - 4.f * tmp[i * 6 + 1].v;

1056

com[2].v = tmp[i * 6 + 4].v - 4.f * tmp[i * 6 + 2].v;

1057

com[3].v = tmp[i * 6 + 5].v - 4.f * tmp[i * 6 + 3].v;

1058

com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;

1059

com[4].v = com[4].v + com[4].v;

1060

com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;

1061

out[i * 6 + 0].v = com[2].v - com[0].v;

1062

out[i * 6 + 1].v = com[2].v + com[1].v;

1063

out[i * 6 + 2].v = com[2].v - com[1].v;

1064

out[i * 6 + 3].v = com[5].v + com[4].v;

1065

out[i * 6 + 4].v = com[5].v - com[4].v;

1066

out[i * 6 + 5].v = com[3].v - com[1].v;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1067

})

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1068

1069

// Compute destination address

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1070

TILE(uint, 36, 1, dst_indirect_y);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1071

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1072

LOOP_UNROLLING(int, i, 0, 1, 36,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1073

{

1074

dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;

1075

dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1076

})

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1077

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1078

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

1079

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1080

}

1081

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1082

//! @cond Doxygen_Suppress

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1083

/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1084

*

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1085

* @note Data layout supported: NHWC

1086

* @note Data type supported: F32/F16

1087

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

1088

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

1089

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

1090

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1091

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1092

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1093

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1094

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1095

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1096

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1097

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1098

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1099

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1100

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1101

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1102

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1103

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1104

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1105

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1106

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1107

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1108

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1109

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1110

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1111

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1112

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1113

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1114

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1115

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1116

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1117

//! @endcond

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1118

__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1119

TENSOR4D(src, BUFFER),

1120

TENSOR4D(dst, BUFFER))

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1121

{

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1122

const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM

1123

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

1124

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1125

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1126

// All the tensor dimensions are passed at compile time.

1127

// In case of dynamic tensor support, the following dimensions should be passed as function argument.

1128

#define _ISRC_WIDTH SRC_WIDTH

1129

#define _ISRC_HEIGHT SRC_HEIGHT

1130

#define _INUM_TILES_X NUM_TILES_X

1131

#define _INUM_TILES_Y NUM_TILES_Y

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1132

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1133

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

1134

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

1135

x -= PAD_LEFT;

1136

y -= PAD_TOP;

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1137

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1138

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1139

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1140

TILE(DATA_TYPE, 8, 1, in);

1141

TILE(DATA_TYPE, 8, 1, out);

1142

1143

// Initialize the input tile

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1144

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1145

{

1146

in[i].v = 0;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1147

})

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1148

1149

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1150

T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

1151

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

1152

T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

1153

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1154

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1155

TILE(DATA_TYPE, 1, 8, com);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1156

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1157

com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;

1158

com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;

1159

com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;

1160

com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;

1161

com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;

1162

com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;

1163

out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;

1164

out[1].s[0] = com[0].s[0] + com[0].s[1];

1165

out[2].s[0] = com[0].s[0] - com[0].s[1];

1166

out[3].s[0] = com[0].s[3] + com[0].s[2];

1167

out[4].s[0] = com[0].s[3] - com[0].s[2];

1168

out[5].s[0] = com[0].s[4] + com[0].s[5];

1169

out[6].s[0] = com[0].s[4] - com[0].s[5];

1170

out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1171

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1172

TILE(uint, 8, 1, dst_indirect_y);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1173

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1174

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1175

{

1176

dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;

1177

dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1178

})

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1179

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1180

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1181

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1182

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1183

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1184

TILE(DATA_TYPE, 64, 1, in);

1185

TILE(DATA_TYPE, 64, 1, out);

1186

1187

// Initialize the input tile

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1188

LOOP_UNROLLING(int, i, 0, 1, 64,

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1189

{

1190

in[i].v = 0;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1191

})

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1192

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1193

// Load the tile from a NHWC tensor

1194

T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1195

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1196

TILE(DATA_TYPE, 8, 8, com);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1197

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1198

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1199

{

1200

com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x

1201

com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x

1202

com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x

1203

com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x

1204

com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

1205

com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];

1206

com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];

1207

com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1208

})

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1209

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1210

TILE(DATA_TYPE, 8, 8, tmp);

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1211

tmp[0].v = com[6].v;

1212

tmp[1].v = com[0].v + com[1].v;

1213

tmp[2].v = com[0].v - com[1].v;

1214

tmp[3].v = com[2].v + com[3].v;

1215

tmp[4].v = com[2].v - com[3].v;

1216

tmp[5].v = com[4].v + com[5].v;

1217

tmp[6].v = com[4].v - com[5].v;

1218

tmp[7].v = com[7].v;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1219

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1220

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1221

{

1222

com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];

1223

com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];

1224

com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];

1225

com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];

1226

com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];

1227

com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];

1228

out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];

1229

out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];

1230

out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];

1231

out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];

1232

out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];

1233

out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];

1234

out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];

1235

out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1236

})

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1237

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1238

TILE(uint, 64, 1, dst_indirect_y);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1239

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1240

LOOP_UNROLLING(int, i, 0, 1, 64,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1241

{

1242

dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;

1243

dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1244

})

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1245

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1246

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1247

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1248

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

1249

}

1250

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1251

//! @cond Doxygen_Suppress

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1252

/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/2x1/1x2 when the data layout is NHWC

1253

*

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1254

* @note Data layout supported: NHWC

1255

* @note Data type supported: F32/F16

1256

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

1257

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

1258

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

1259

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

1260

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1261

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1262

* @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1263

* @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1264

*

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1265

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1266

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1267

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1268

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1269

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1270

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1271

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1272

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1273

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1274

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1275

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1276

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1277

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1278

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1279

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1280

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1281

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1282

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1283

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1284

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1285

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1286

//! @endcond

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1287

__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1288

TENSOR4D(src, BUFFER),

1289

TENSOR4D(dst, BUFFER))

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1290

{

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1291

const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM

1292

const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y

1293

const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1294

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1295

// All the tensor dimensions are passed at compile time.

1296

// In case of dynamic tensor support, the following dimensions should be passed as function argument.

1297

#define _ISRC_WIDTH SRC_WIDTH

1298

#define _ISRC_HEIGHT SRC_HEIGHT

1299

#define _INUM_TILES_X NUM_TILES_X

1300

#define _INUM_TILES_Y NUM_TILES_Y

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1301

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1302

int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;

1303

int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;

1304

x -= PAD_LEFT;

1305

y -= PAD_TOP;

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1306

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1307

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1308

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1309

TILE(DATA_TYPE, 8, 1, in);

1310

TILE(DATA_TYPE, 8, 1, out);

1311

1312

// Initialize the input tile

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1313

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1314

{

1315

in[i].v = 0;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1316

})

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1317

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1318

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1319

T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

1320

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

1321

T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

1322

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1323

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1324

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1325

{

1326

in[i].v *= (DATA_TYPE) - 36.0f;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1327

})

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1328

Sheri Zhang

6dbcc0e

2021-04-12 10:53:57 +0100

[diff] [blame]

1329

TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1330

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1331

com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;

1332

com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;

1333

com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;

1334

com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;

1335

com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;

1336

com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;

1337

out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;

1338

out[1].s[0] = com[0].s[0] - com[0].s[1];

1339

out[2].s[0] = com[0].s[0] + com[0].s[1];

1340

out[3].s[0] = com[0].s[2] - com[0].s[3];

1341

out[4].s[0] = com[0].s[2] + com[0].s[3];

1342

out[5].s[0] = com[0].s[4] - com[0].s[5];

1343

out[6].s[0] = com[0].s[4] + com[0].s[5];

1344

out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1345

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1346

TILE(uint, 8, 1, dst_indirect_y);

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1347

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1348

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1349

{

1350

dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;

1351

dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1352

})

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1353

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1354

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1355

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1356

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2020-10-19 12:49:44 +0100

[diff] [blame]

1357

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1358

TILE(DATA_TYPE, 64, 1, in);

1359

TILE(DATA_TYPE, 64, 1, out);

1360

1361

// Initialize the input tile

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1362

LOOP_UNROLLING(int, i, 0, 1, 64,

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1363

{

1364

in[i].v = 0;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1365

})

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1366

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1367

// Load the tile from a NHWC tensor

1368

T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1369

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1370

TILE(DATA_TYPE, 8, 8, com);

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1371

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1372

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1373

{

1374

com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

1375

com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];

1376

com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

1377

com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];

1378

com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];

1379

com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];

1380

com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];

1381

com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1382

})

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1383

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1384

TILE(DATA_TYPE, 8, 8, tmp);

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1385

tmp[0].v = com[6].v;

1386

tmp[1].v = com[0].v - com[1].v;

1387

tmp[2].v = com[0].v + com[1].v;

1388

tmp[3].v = com[2].v - com[3].v;

1389

tmp[4].v = com[2].v + com[3].v;

1390

tmp[5].v = com[4].v - com[5].v;

1391

tmp[6].v = com[4].v + com[5].v;

1392

tmp[7].v = com[7].v;

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1393

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1394

LOOP_UNROLLING(int, i, 0, 1, 8,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1395

{

1396

com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];

1397

com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];

1398

com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];

1399

com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];

1400

com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];

1401

com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];

1402

out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];

1403

out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];

1404

out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];

1405

out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];

1406

out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];

1407

out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];

1408

out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];

1409

out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1410

})

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1411

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1412

TILE(uint, 64, 1, dst_indirect_y);

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1413

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1414

LOOP_UNROLLING(int, i, 0, 1, 64,

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1415

{

1416

dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;

1417

dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;

Giorgio Arena

2021-05-13 16:58:51 +0100

[diff] [blame]

1418

})

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1419

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1420

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

1421

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1422

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1423

}

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1424

1425

//! @cond Doxygen_Suppress

1426

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC

1427

*

1428

* @note Data layout supported: NHWC

1429

* @note Data type supported: F32/F16

1430

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

1431

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

1432

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

1433

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

1434

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1435

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1436

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1437

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

1438

*

1439

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

1440

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1441

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1442

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1443

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1444

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1445

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1446

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1447

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1448

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1449

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1450

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1451

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1452

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1453

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1454

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1455

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1456

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1457

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1458

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1459

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1460

//! @endcond

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1461

__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(

1462

TENSOR4D(src, BUFFER),

1463

TENSOR4D(dst, BUFFER))

1464

{

1465

// Thin entry point: all work is delegated to winograd_input_transform_4x4_3x3_stepz1_nhwc,
// which receives every src_* and dst_* argument unchanged and in the same order.
// NOTE(review): src_ptr, src_stride_x, ... are not spelled out in this signature; presumably
// TENSOR4D(src, BUFFER) / TENSOR4D(dst, BUFFER) expand to them - confirm against the macro definition.
// NOTE(review): the 4x1 / 3x1 (horizontal) behaviour presumably comes from compiling the callee with
// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL, as the header notes require; the callee body is not visible here.
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

1485

}

1486

1487

//! @cond Doxygen_Suppress

1488

/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC

1489

*

1490

* @note Data layout supported: NHWC

1491

* @note Data type supported: F32/F16

1492

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

1493

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

1494

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

1495

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

1496

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1497

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1498

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1499

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

1500

*

1501

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

1502

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1503

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1504

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1505

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1506

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1507

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1508

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1509

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1510

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1511

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1512

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1513

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1514

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1515

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1516

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1517

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1518

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1519

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1520

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1521

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1522

//! @endcond

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1523

__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(

1524

TENSOR4D(src, BUFFER),

1525

TENSOR4D(dst, BUFFER))

1526

{

1527

// Thin entry point: all work is delegated to winograd_input_transform_4x4_5x5_stepz1_nhwc,
// which receives every src_* and dst_* argument unchanged and in the same order.
// NOTE(review): src_ptr, src_stride_x, ... are not spelled out in this signature; presumably
// TENSOR4D(src, BUFFER) / TENSOR4D(dst, BUFFER) expand to them - confirm against the macro definition.
// NOTE(review): the 4x1 / 5x1 (horizontal) behaviour presumably comes from compiling the callee with
// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL; the callee body is not visible here - confirm.
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

1547

}

1548

1549

//! @cond Doxygen_Suppress

1550

/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC

1551

*

1552

* @note Data layout supported: NHWC

1553

* @note Data type supported: F32/F16

1554

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

1555

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

1556

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

1557

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

1558

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1559

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1560

* @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1561

* @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

1562

*

1563

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

1564

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1565

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1566

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1567

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1568

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1569

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1570

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1571

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1572

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1573

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1574

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1575

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1576

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1577

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1578

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1579

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1580

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1581

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1582

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1583

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1584

//! @endcond

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1585

__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(

1586

TENSOR4D(src, BUFFER),

1587

TENSOR4D(dst, BUFFER))

1588

{

1589

// Thin entry point: all work is delegated to winograd_input_transform_2x2_7x7_stepz1_nhwc,
// which receives every src_* and dst_* argument unchanged and in the same order.
// NOTE(review): src_ptr, src_stride_x, ... are not spelled out in this signature; presumably
// TENSOR4D(src, BUFFER) / TENSOR4D(dst, BUFFER) expand to them - confirm against the macro definition.
// NOTE(review): the 2x1 / 7x1 (horizontal) behaviour presumably comes from compiling the callee with
// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL; confirm against the callee's preprocessor branches.
winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

1609

}

1610

1611

//! @cond Doxygen_Suppress

1612

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC

1613

*

1614

* @note Data layout supported: NHWC

1615

* @note Data type supported: F32/F16

1616

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

1617

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

1618

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

1619

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

1620

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1621

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1622

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1623

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

1624

*

1625

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

1626

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1627

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1628

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1629

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1630

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1631

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1632

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1633

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1634

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1635

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1636

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1637

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1638

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1639

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1640

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1641

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1642

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1643

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1644

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1645

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1646

//! @endcond

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1647

__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(

1648

TENSOR4D(src, BUFFER),

1649

TENSOR4D(dst, BUFFER))

1650

{

1651

// Thin entry point: all work is delegated to winograd_input_transform_4x4_3x3_stepz1_nhwc,
// which receives every src_* and dst_* argument unchanged and in the same order.
// NOTE(review): src_ptr, src_stride_x, ... are not spelled out in this signature; presumably
// TENSOR4D(src, BUFFER) / TENSOR4D(dst, BUFFER) expand to them - confirm against the macro definition.
// NOTE(review): the 1x4 / 1x3 (vertical) behaviour presumably comes from compiling the callee with
// -DWINOGRAD_INPUT_TRANSFORM_VERTICAL, as the header notes require; the callee body is not visible here.
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

1671

}

1672

1673

//! @cond Doxygen_Suppress

1674

/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC

1675

*

1676

* @note Data layout supported: NHWC

1677

* @note Data type supported: F32/F16

1678

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

1679

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

1680

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

1681

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

1682

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1683

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1684

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1685

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

1686

*

1687

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

1688

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1689

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1690

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1691

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1692

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1693

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1694

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1695

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1696

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1697

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1698

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1699

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1700

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1701

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1702

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1703

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1704

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1705

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1706

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1707

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1708

//! @endcond

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1709

__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(

1710

TENSOR4D(src, BUFFER),

1711

TENSOR4D(dst, BUFFER))

1712

{

1713

// Thin entry point: all work is delegated to winograd_input_transform_4x4_5x5_stepz1_nhwc,
// which receives every src_* and dst_* argument unchanged and in the same order.
// NOTE(review): src_ptr, src_stride_x, ... are not spelled out in this signature; presumably
// TENSOR4D(src, BUFFER) / TENSOR4D(dst, BUFFER) expand to them - confirm against the macro definition.
// NOTE(review): the 1x4 / 1x5 (vertical) behaviour presumably comes from compiling the callee with
// -DWINOGRAD_INPUT_TRANSFORM_VERTICAL; the callee body is not visible here - confirm.
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

1733

}

1734

1735

//! @cond Doxygen_Suppress

1736

/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC

1737

*

1738

* @note Data layout supported: NHWC

1739

* @note Data type supported: F32/F16

1740

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)

1741

* @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).

1742

* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)

1743

* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)

1744

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1745

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1746

* @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1747

* @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

1748

*

1749

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

1750

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1751

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1752

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1753

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1754

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1755

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1756

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1757

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1758

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1759

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1760

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1761

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1762

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1763

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1764

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1765

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1766

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1767

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1768

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1769

*/

Gian Marco Iodice

2021-04-13 15:53:20 +0100

[diff] [blame]

1770

//! @endcond

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

1771

__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(

1772

TENSOR4D(src, BUFFER),

1773

TENSOR4D(dst, BUFFER))

1774

{

1775

// Thin entry point: all work is delegated to winograd_input_transform_2x2_7x7_stepz1_nhwc,
// which receives every src_* and dst_* argument unchanged and in the same order.
// NOTE(review): src_ptr, src_stride_x, ... are not spelled out in this signature; presumably
// TENSOR4D(src, BUFFER) / TENSOR4D(dst, BUFFER) expand to them - confirm against the macro definition.
// NOTE(review): the 1x2 / 1x7 (vertical) behaviour presumably comes from compiling the callee with
// -DWINOGRAD_INPUT_TRANSFORM_VERTICAL; confirm against the callee's preprocessor branches.
winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_stride_w,

src_step_w,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_stride_w,

dst_step_w,

dst_offset_first_element_in_bytes);

1795

}

1796

#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1797

1798

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

1799

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1

1800

*

1801

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1802

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1803

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1804

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1805

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1806

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1807

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1808

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1809

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1810

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1811

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1812

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1813

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1814

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1815

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1816

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1817

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1818

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1819

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1820

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1821

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1822

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1823

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1824

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1825

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1826

*/

1827

__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(

1828

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1829

TENSOR3D_DECLARATION(dst),

1830

uint src_stride_w,

1831

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1832

{

1833

winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1848

dst_offset_first_element_in_bytes,

1849

src_stride_w,

1850

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1851

}

1852

1853

/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2

1854

*

1855

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1856

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1857

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1858

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1859

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1860

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1861

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1862

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1863

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1864

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1865

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1866

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1867

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1868

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1869

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1870

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1871

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1872

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1873

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1874

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1875

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1876

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1877

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1878

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1879

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1880

*/

1881

__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(

1882

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1883

TENSOR3D_DECLARATION(dst),

1884

uint src_stride_w,

1885

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1886

{

1887

winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1902

dst_offset_first_element_in_bytes,

1903

src_stride_w,

1904

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1905

}

1906

1907

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1

1908

*

1909

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1910

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1911

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1912

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1913

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1914

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1915

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1916

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1917

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1918

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1919

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1920

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1921

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1922

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1923

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1924

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1925

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1926

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1927

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1928

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1929

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1930

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1931

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1932

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1933

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1934

*/

1935

__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(

1936

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1937

TENSOR3D_DECLARATION(dst),

1938

uint src_stride_w,

1939

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1940

{

1941

winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1956

dst_offset_first_element_in_bytes,

1957

src_stride_w,

1958

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1959

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1960

1961

/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW

1962

*

1963

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1964

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1965

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1966

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1967

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1968

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1969

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1970

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1971

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1972

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1973

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1974

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1975

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1976

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1977

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1978

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1979

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1980

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1981

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1982

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1983

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1984

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1985

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1986

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1987

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1988

*/

1989

__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(

1990

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1991

TENSOR3D_DECLARATION(dst),

1992

uint src_stride_w,

1993

uint dst_stride_w)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1994

{

1995

winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2010

dst_offset_first_element_in_bytes,

2011

src_stride_w,

2012

dst_stride_w);

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2013

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2014

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

2015

2016

#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

2017

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2

2018

*

2019

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2020

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2021

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2022

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

2023

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2024

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2025

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2026

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2027

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2028

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2029

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2030

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2031

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2032

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2033

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2034

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2035

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2036

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2037

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2038

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2039

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2040

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2041

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2042

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2043

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2044

*/

2045

__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(

2046

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2047

TENSOR3D_DECLARATION(dst),

2048

uint src_stride_w,

2049

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2050

{

2051

winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2066

dst_offset_first_element_in_bytes,

2067

src_stride_w,

2068

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2069

}

2070

2071

/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2

2072

*

2073

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2074

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2075

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2076

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

2077

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2078

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2079

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2080

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2081

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2082

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2083

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2084

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2085

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2086

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2087

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2088

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2089

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2090

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2091

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2092

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2093

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2094

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2095

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2096

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2097

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2098

*/

2099

__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(

2100

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2101

TENSOR3D_DECLARATION(dst),

2102

uint src_stride_w,

2103

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2104

{

2105

winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2120

dst_offset_first_element_in_bytes,

2121

src_stride_w,

2122

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2123

}

2124

2125

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4

2126

*

2127

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2128

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2129

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2130

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2131

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2132

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2133

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2134

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2135

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2136

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2137

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2138

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2139

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2140

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2141

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2142

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2143

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2144

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2145

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2146

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2147

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2148

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2149

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2150

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2151

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2152

*/

2153

__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(

2154

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2155

TENSOR3D_DECLARATION(dst),

2156

uint src_stride_w,

2157

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2158

{

2159

winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2174

dst_offset_first_element_in_bytes,

2175

src_stride_w,

2176

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2177

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2178

2179

/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4

2180

*

2181

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2182

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2183

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2184

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2185

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2186

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2187

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2188

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2189

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2190

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2191

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2192

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2193

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2194

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2195

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2196

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2197

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2198

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2199

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2200

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2201

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2202

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2203

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2204

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2205

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2206

*/

2207

__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(

2208

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2209

TENSOR3D_DECLARATION(dst),

2210

uint src_stride_w,

2211

uint dst_stride_w)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2212

{

2213

winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2228

dst_offset_first_element_in_bytes,

2229

src_stride_w,

2230

dst_stride_w);

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2231

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2232

#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Michele Di Giorgio