Blame - src/core/CL/cl_kernels/nchw/winograd_input_transform.cl - ml/ComputeLibrary

2018-10-29 18:01:52 +0000

[diff] [blame]

113

#else /* defined(SRC_DEPTH) */

114

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

115

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

116

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

117

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

118

119

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

120

VEC_DATA_TYPE(DATA_TYPE, 4)

121

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

122

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

123

VEC_DATA_TYPE(DATA_TYPE, 4)

124

in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

125

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

126

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

127

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

128

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

129

VEC_DATA_TYPE(DATA_TYPE, 4)

130

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

131

VEC_DATA_TYPE(DATA_TYPE, 4)

132

in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

133

VEC_DATA_TYPE(DATA_TYPE, 4)

134

in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

135

VEC_DATA_TYPE(DATA_TYPE, 4)

136

in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

137

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

138

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

139

VEC_DATA_TYPE(DATA_TYPE, 4)

140

tmp0 = in_row0;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

141

142

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

143

tmp0 -= in_row2;

144

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

145

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

146

DATA_TYPE out00 = tmp0.s0 - tmp0.s2;

147

DATA_TYPE out01 = tmp0.s1 + tmp0.s2;

148

DATA_TYPE out02 = tmp0.s2 - tmp0.s1;

149

DATA_TYPE out03 = tmp0.s1 - tmp0.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

150

151

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

152

VEC_DATA_TYPE(DATA_TYPE, 4)

153

tmp1 = in_row1 + in_row2;

154

VEC_DATA_TYPE(DATA_TYPE, 4)

155

tmp2 = in_row2 - in_row1;

156

VEC_DATA_TYPE(DATA_TYPE, 4)

157

tmp3 = in_row1 - in_row3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

158

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

159

DATA_TYPE out10 = tmp1.s0 - tmp1.s2;

160

DATA_TYPE out11 = tmp1.s1 + tmp1.s2;

161

DATA_TYPE out12 = tmp1.s2 - tmp1.s1;

162

DATA_TYPE out13 = tmp1.s1 - tmp1.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

163

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

164

DATA_TYPE out20 = tmp2.s0 - tmp2.s2;

165

DATA_TYPE out21 = tmp2.s1 + tmp2.s2;

166

DATA_TYPE out22 = tmp2.s2 - tmp2.s1;

167

DATA_TYPE out23 = tmp2.s1 - tmp2.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

168

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

169

DATA_TYPE out30 = tmp3.s0 - tmp3.s2;

170

DATA_TYPE out31 = tmp3.s1 + tmp3.s2;

171

DATA_TYPE out32 = tmp3.s2 - tmp3.s1;

172

DATA_TYPE out33 = tmp3.s1 - tmp3.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

173

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

174

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

175

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

176

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

177

#else /* defined(SRC_DEPTH) */

178

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;

179

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

180

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

181

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;

182

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;

183

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;

184

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

185

186

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

187

*((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;

188

*((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;

189

*((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;

190

*((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;

191

*((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;

192

*((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;

193

*((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;

194

*((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;

195

*((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;

196

*((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;

197

*((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;

198

*((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

199

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

200

}

201

202

/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2

203

*

204

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

205

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

206

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

207

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

208

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

209

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

210

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

211

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

212

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

213

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

214

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

215

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

216

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

217

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

218

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

219

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

220

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

221

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

222

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

223

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

224

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

225

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

226

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

227

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

228

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

229

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

230

*/

231

__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(

232

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

233

TENSOR3D_DECLARATION(dst),

234

uint src_stride_w,

235

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

236

{

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

237

const int x = get_global_id(0);

238

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

239

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

240

const int z = (get_global_id(2) * 2) % SRC_DEPTH;

241

const int b = (get_global_id(2) * 2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

242

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

243

const int z = get_global_id(2) * 2;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

244

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

245

246

// Compute input address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

247

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

248

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

249

#else /* defined(SRC_DEPTH) */

250

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

251

#endif /* defined(SRC_DEPTH) */

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

252

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

253

254

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

255

VEC_DATA_TYPE(DATA_TYPE, 4)

256

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

257

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

258

VEC_DATA_TYPE(DATA_TYPE, 4)

259

in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

260

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

261

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

262

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

263

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

264

VEC_DATA_TYPE(DATA_TYPE, 4)

265

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

266

VEC_DATA_TYPE(DATA_TYPE, 4)

267

in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

268

VEC_DATA_TYPE(DATA_TYPE, 4)

269

in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

270

VEC_DATA_TYPE(DATA_TYPE, 4)

271

in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

272

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

273

274

src_addr += src_stride_z;

275

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

276

VEC_DATA_TYPE(DATA_TYPE, 4)

277

in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

278

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

279

VEC_DATA_TYPE(DATA_TYPE, 4)

280

in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

281

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

282

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

283

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

284

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

285

VEC_DATA_TYPE(DATA_TYPE, 4)

286

in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

287

VEC_DATA_TYPE(DATA_TYPE, 4)

288

in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

289

VEC_DATA_TYPE(DATA_TYPE, 4)

290

in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

291

VEC_DATA_TYPE(DATA_TYPE, 4)

292

in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

293

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

294

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

295

VEC_DATA_TYPE(DATA_TYPE, 4)

296

tmp0 = in_row0;

297

VEC_DATA_TYPE(DATA_TYPE, 4)

298

tmp4 = in_row4;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

299

300

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

301

tmp0 -= in_row2;

302

tmp4 -= in_row6;

303

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

304

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

305

VEC_DATA_TYPE(DATA_TYPE, 2)

306

out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);

307

VEC_DATA_TYPE(DATA_TYPE, 2)

308

out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);

309

VEC_DATA_TYPE(DATA_TYPE, 2)

310

out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);

311

VEC_DATA_TYPE(DATA_TYPE, 2)

312

out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

313

314

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

315

VEC_DATA_TYPE(DATA_TYPE, 4)

316

tmp1 = in_row1 + in_row2;

317

VEC_DATA_TYPE(DATA_TYPE, 4)

318

tmp2 = in_row2 - in_row1;

319

VEC_DATA_TYPE(DATA_TYPE, 4)

320

tmp3 = in_row1 - in_row3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

321

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

322

VEC_DATA_TYPE(DATA_TYPE, 4)

323

tmp5 = in_row5 + in_row6;

324

VEC_DATA_TYPE(DATA_TYPE, 4)

325

tmp6 = in_row6 - in_row5;

326

VEC_DATA_TYPE(DATA_TYPE, 4)

327

tmp7 = in_row5 - in_row7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

328

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

329

VEC_DATA_TYPE(DATA_TYPE, 2)

330

out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);

331

VEC_DATA_TYPE(DATA_TYPE, 2)

332

out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);

333

VEC_DATA_TYPE(DATA_TYPE, 2)

334

out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);

335

VEC_DATA_TYPE(DATA_TYPE, 2)

336

out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

337

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

338

VEC_DATA_TYPE(DATA_TYPE, 2)

339

out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);

340

VEC_DATA_TYPE(DATA_TYPE, 2)

341

out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);

342

VEC_DATA_TYPE(DATA_TYPE, 2)

343

out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);

344

VEC_DATA_TYPE(DATA_TYPE, 2)

345

out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

346

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

347

VEC_DATA_TYPE(DATA_TYPE, 2)

348

out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);

349

VEC_DATA_TYPE(DATA_TYPE, 2)

350

out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);

351

VEC_DATA_TYPE(DATA_TYPE, 2)

352

out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);

353

VEC_DATA_TYPE(DATA_TYPE, 2)

354

out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

355

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

356

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

357

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

358

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

359

#else /* defined(SRC_DEPTH) */

360

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;

361

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

362

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

363

vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));

364

vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));

365

vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));

366

vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

367

368

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

369

vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));

370

vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));

371

vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));

372

vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));

373

vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));

374

vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));

375

vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));

376

vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));

377

vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));

378

vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));

379

vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));

380

vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

381

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

382

}

383

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

384

/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

385

*

386

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

387

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

388

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

389

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

390

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

391

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

392

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

393

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

394

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

395

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

396

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

397

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

398

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

399

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

400

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

401

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

402

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

403

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

404

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

405

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

406

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

407

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

408

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

409

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

410

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

411

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

412

*/

413

__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(

414

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

415

TENSOR3D_DECLARATION(dst),

416

uint src_stride_w,

417

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

418

{

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

419

const int x = get_global_id(0);

420

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

421

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

422

const int z = get_global_id(2) % SRC_DEPTH;

423

const int b = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

424

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

425

const int z = get_global_id(2);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

426

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

427

428

// Compute input address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

429

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

430

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

431

#else /* defined(SRC_DEPTH) */

432

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

433

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

434

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

435

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

436

437

#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

438

// Row0

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

439

VEC_DATA_TYPE(DATA_TYPE, 4)

440

d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

441

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

442

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

443

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

444

VEC_DATA_TYPE(DATA_TYPE, 2)

445

d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),

446

*((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

447

#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

448

// Row0

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

449

VEC_DATA_TYPE(DATA_TYPE, 4)

450

d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

451

VEC_DATA_TYPE(DATA_TYPE, 2)

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

452

d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

453

#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

454

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

455

DATA_TYPE out0 = 0.0f;

456

DATA_TYPE out1 = 0.0f;

457

DATA_TYPE out2 = 0.0f;

458

DATA_TYPE out3 = 0.0f;

459

DATA_TYPE out4 = 0.0f;

460

DATA_TYPE out5 = 0.0f;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

461

462

// Channels [0, 5]: [out00, out01, out02, out03, out04, out05]

463

out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;

464

out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;

465

out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;

466

out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;

467

out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;

468

out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;

469

470

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

471

// Row4

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

472

VEC_DATA_TYPE(DATA_TYPE, 4)

473

d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

474

VEC_DATA_TYPE(DATA_TYPE, 2)

475

d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

476

477

// k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

478

DATA_TYPE k0 = d41.s0;

479

DATA_TYPE k1 = d41.s0;

480

DATA_TYPE k2 = d41.s0;

481

DATA_TYPE k3 = d41.s0;

482

DATA_TYPE k4 = d41.s0;

483

DATA_TYPE k5 = 0.0f;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

484

485

k0 += 4.0f * d40.s0 - 5.0f * d40.s2;

486

k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;

487

k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;

488

k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;

489

k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;

490

k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;

out0 += k0;

out1 += k1;

out2 += k2;

out3 += k3;

out4 += k4;

out5 += k5;

// Row2

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

500

VEC_DATA_TYPE(DATA_TYPE, 4)

501

d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

502

VEC_DATA_TYPE(DATA_TYPE, 2)

503

d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

504

505

out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;

506

out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;

507

out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;

508

out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;

509

out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;

510

out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;

511

#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

512

513

// Compute destination address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

514

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

515

__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

516

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

517

__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

518

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

519

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

520

uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

521

522

*(dst_addr) = out0;

523

dst_addr += dst_plane_stride;

524

*(dst_addr) = out1;

525

dst_addr += dst_plane_stride;

526

*(dst_addr) = out2;

527

dst_addr += dst_plane_stride;

528

*(dst_addr) = out3;

529

dst_addr += dst_plane_stride;

530

*(dst_addr) = out4;

531

dst_addr += dst_plane_stride;

532

*(dst_addr) = out5;

533

dst_addr += dst_plane_stride;

534

535

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

DATA_TYPE out6 = k0;

DATA_TYPE out7 = k1;

DATA_TYPE out8 = k2;

DATA_TYPE out9 = k3;

DATA_TYPE out10 = k4;

541

DATA_TYPE out11 = k5;

542

DATA_TYPE out12 = k0;

543

DATA_TYPE out13 = k1;

544

DATA_TYPE out14 = k2;

545

DATA_TYPE out15 = k3;

546

DATA_TYPE out16 = k4;

547

DATA_TYPE out17 = k5;

548

DATA_TYPE out18 = k0;

549

DATA_TYPE out19 = k1;

550

DATA_TYPE out20 = k2;

551

DATA_TYPE out21 = k3;

552

DATA_TYPE out22 = k4;

553

DATA_TYPE out23 = k5;

554

DATA_TYPE out24 = k0;

555

DATA_TYPE out25 = k1;

556

DATA_TYPE out26 = k2;

557

DATA_TYPE out27 = k3;

558

DATA_TYPE out28 = k4;

559

DATA_TYPE out29 = k5;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

560

561

// Row1

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

562

VEC_DATA_TYPE(DATA_TYPE, 4)

563

d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

564

VEC_DATA_TYPE(DATA_TYPE, 2)

565

d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

566

567

// Row3

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

568

VEC_DATA_TYPE(DATA_TYPE, 4)

569

d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

570

VEC_DATA_TYPE(DATA_TYPE, 2)

571

d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

572

573

// Compute common parts for the channels between [6, 29]

574

// Channels [6, 11]: [out10, out11, out12, out13, out14, out15]

575

// Channels [12, 17]: [out20, out21, out22, out23, out24, out25]

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

576

DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;

577

DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;

578

DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;

579

DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;

580

DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;

581

DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;

582

DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;

583

DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;

584

DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;

585

DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;

586

DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;

587

DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

588

589

// Channels [18, 23]: [out30, out31, out32, out33, out34, out35]

590

// Channels [24, 29]: [out40, out41, out42, out43, out44, out45]

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

591

DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;

592

DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0

593

DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0

594

DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;

595

DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;

596

DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3

597

DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0

598

DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;

599

DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;

600

DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)

601

DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1

602

DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

603

604

out6 += part0 - part1;

605

out12 += part0 + part1;

606

out7 += part2 + part3 + part4 + part5;

607

out8 += part2 - part3 + part4 - part5;

608

out13 += part2 + part3 - part4 - part5;

609

out14 += part2 - part3 - part4 + part5;

610

out9 += part6 + part7 + part8 + part9;

611

out10 += part6 - part7 + part8 - part9;

612

out15 += part6 - part7 - part8 + part9;

613

out16 += part6 + part7 - part8 - part9;

614

out11 += part10 + part11;

615

out17 += part10 - part11;

616

617

out18 += part13 - part12;

618

out24 += part13 + part12;

619

out19 += part14 + part15 + part16 + part17;

620

out20 += part14 - part15 + part16 - part17;

621

out25 += part14 - part15 - part16 + part17;

622

out26 += part14 + part15 - part16 - part17;

623

out21 += part18 + part19 + part20 + part21;

624

out22 += part18 - part19 + part20 - part21;

625

out27 += part18 - part19 - part20 + part21;

626

out28 += part18 + part19 - part20 - part21;

627

out23 += part22 + part23;

628

out29 += part22 - part23;

629

630

*(dst_addr) = out6;

631

dst_addr += dst_plane_stride;

632

*(dst_addr) = out7;

633

dst_addr += dst_plane_stride;

634

*(dst_addr) = out8;

635

dst_addr += dst_plane_stride;

636

*(dst_addr) = out9;

637

dst_addr += dst_plane_stride;

638

*(dst_addr) = out10;

639

dst_addr += dst_plane_stride;

640

*(dst_addr) = out11;

641

dst_addr += dst_plane_stride;

642

*(dst_addr) = out12;

643

dst_addr += dst_plane_stride;

644

*(dst_addr) = out13;

645

dst_addr += dst_plane_stride;

646

*(dst_addr) = out14;

647

dst_addr += dst_plane_stride;

648

*(dst_addr) = out15;

649

dst_addr += dst_plane_stride;

650

*(dst_addr) = out16;

651

dst_addr += dst_plane_stride;

652

*(dst_addr) = out17;

653

dst_addr += dst_plane_stride;

654

655

*(dst_addr) = out18;

656

dst_addr += dst_plane_stride;

657

*(dst_addr) = out19;

658

dst_addr += dst_plane_stride;

659

*(dst_addr) = out20;

660

dst_addr += dst_plane_stride;

661

*(dst_addr) = out21;

662

dst_addr += dst_plane_stride;

663

*(dst_addr) = out22;

664

dst_addr += dst_plane_stride;

665

*(dst_addr) = out23;

666

dst_addr += dst_plane_stride;

667

*(dst_addr) = out24;

668

dst_addr += dst_plane_stride;

669

*(dst_addr) = out25;

670

dst_addr += dst_plane_stride;

671

*(dst_addr) = out26;

672

dst_addr += dst_plane_stride;

673

*(dst_addr) = out27;

674

dst_addr += dst_plane_stride;

675

*(dst_addr) = out28;

676

dst_addr += dst_plane_stride;

677

*(dst_addr) = out29;

678

dst_addr += dst_plane_stride;

679

680

// Row5

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

681

VEC_DATA_TYPE(DATA_TYPE, 4)

682

d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

683

VEC_DATA_TYPE(DATA_TYPE, 2)

684

d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

685

686

// Channels [30, 35]

687

out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

688

out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

689

out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

690

out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

691

out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

692

out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;

693

694

*(dst_addr) = out0;

695

dst_addr += dst_plane_stride;

696

*(dst_addr) = out1;

697

dst_addr += dst_plane_stride;

698

*(dst_addr) = out2;

699

dst_addr += dst_plane_stride;

700

*(dst_addr) = out3;

701

dst_addr += dst_plane_stride;

702

*(dst_addr) = out4;

703

dst_addr += dst_plane_stride;

704

*(dst_addr) = out5;

705

dst_addr += dst_plane_stride;

706

#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

707

}

708

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

709

/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW

710

*

711

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

712

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

713

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

714

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

715

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

716

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

717

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

718

*

719

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

720

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

721

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

722

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

723

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

724

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

725

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

726

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

727

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

728

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

729

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

730

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

731

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

732

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

733

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

734

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

735

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

736

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

737

*/

738

__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(

739

TENSOR3D_DECLARATION(src),

740

TENSOR3D_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int x = get_global_id(0);

745

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

746

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

747

const int z = get_global_id(2) % SRC_DEPTH;

748

const int b = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

749

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

750

const int z = get_global_id(2);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

751

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

752

753

// Compute input address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

754

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

755

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

756

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame]

757

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

758

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

759

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

760

761

// Load input tile

762

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

763

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));

764

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

765

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

766

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

767

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

768

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),

769

*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),

770

*((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),

771

*((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),

772

*((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));

773

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

774

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

775

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

776

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

777

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

778

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

779

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

780

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));

781

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));

782

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

783

784

// Calculate common factors for intermediate tensor

785

VEC_DATA_TYPE(DATA_TYPE, 8)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

786

tmp0 = in_row0;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

787

VEC_DATA_TYPE(DATA_TYPE, 8)

788

comm_fact0 = 0.0f;

789

790

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

791

comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

792

tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

793

794

VEC_DATA_TYPE(DATA_TYPE, 8)

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

795

comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

796

VEC_DATA_TYPE(DATA_TYPE, 8)

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

797

comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

798

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

799

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;

800

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

801

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

802

comm_fact0 = (DATA_TYPE)2.5f * in_row3;

803

comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

804

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

805

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;

806

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

807

Giorgio Arena

2021-03-22 17:02:26 +0000

[diff] [blame]

808

comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;

809

comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

810

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

811

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;

812

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;

813

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

814

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

815

816

// Calculate output rows (reuse comm_fact0 vector)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

817

VEC_DATA_TYPE(DATA_TYPE, 8)

818

out0;

819

820

OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

821

822

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2021-04-01 16:17:16 +0100

[diff] [blame]

823

VEC_DATA_TYPE(DATA_TYPE, 8)

824

out1, out2, out3, out4, out5, out6, out7;

825

826

OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);

827

OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);

828

OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);

829

OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);

830

OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);

831

OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);

832

OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

833

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

834

835

// Store values across the channels

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

836

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

837

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

838

#else /* defined(SRC_DEPTH) */

839

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;

840

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

841

842

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;

843

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;

844

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;

845

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;

846

*((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;

847

*((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;

848

*((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;

849

*((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;

850

851

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

852

*((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;

853

*((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;

854

*((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;

855

*((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;

856

*((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;

857

*((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;

858

*((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;

859

*((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;

860

*((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;

861

*((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;

862

*((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;

863

*((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;

864

*((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;

865

*((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;

866

*((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;

867

*((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;

868

*((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;

869

*((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;

870

*((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;

871

*((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;

872

*((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;

873

*((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;

874

*((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;

875

*((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;

876

*((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;

877

*((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;

878

*((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;

879

*((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;

880

*((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;

881

*((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;

882

*((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;

883

*((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;

884

*((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;

885

*((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;

886

*((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;

887

*((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;

888

*((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;

889

*((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;

890

*((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;

891

*((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;

892

*((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;

893

*((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;

894

*((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;

895

*((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;

896

*((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;

897

*((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;

898

*((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;

899

*((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;

900

*((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;

901

*((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;

902

*((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;

903

*((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;

904

*((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;

905

*((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;

906

*((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;

907

*((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;

908

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

909

}

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

910

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

911

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

912

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1

913

*

914

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

915

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

916

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

917

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

918

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

919

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

920

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

921

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

922

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

923

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

924

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

925

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

926

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

927

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

928

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

929

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

930

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

931

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

932

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

933

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

934

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

935

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

936

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

937

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

938

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

939

*/

940

__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(

941

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

942

TENSOR3D_DECLARATION(dst),

943

uint src_stride_w,

944

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

945

{

946

winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

961

dst_offset_first_element_in_bytes,

962

src_stride_w,

963

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

964

}

965

966

/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2

967

*

968

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

969

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

970

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

971

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

972

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

973

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

974

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

975

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

976

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

977

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

978

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

979

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

980

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

981

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

982

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

983

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

984

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

985

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

986

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

987

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

988

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

989

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

990

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

991

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

992

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

993

*/

994

__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(

995

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

996

TENSOR3D_DECLARATION(dst),

997

uint src_stride_w,

998

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

999

{

1000

winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1015

dst_offset_first_element_in_bytes,

1016

src_stride_w,

1017

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1018

}

1019

1020

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1

1021

*

1022

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1023

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1024

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1025

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1026

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1027

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1028

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1029

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1030

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1031

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1032

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1033

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1034

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1035

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1036

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

1037

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1038

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1039

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1040

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1041

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1042

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1043

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1044

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1045

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1046

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1047

*/

1048

__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(

1049

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1050

TENSOR3D_DECLARATION(dst),

1051

uint src_stride_w,

1052

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1053

{

1054

winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1069

dst_offset_first_element_in_bytes,

1070

src_stride_w,

1071

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1072

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1073

1074

/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW

1075

*

1076

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1077

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1078

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1079

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1080

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1081

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1082

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1083

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1084

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1085

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1086

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1087

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1088

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1089

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1090

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

1091

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1092

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1093

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1094

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1095

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1096

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1097

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1098

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1099

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1100

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1101

*/

1102

__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(

1103

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1104

TENSOR3D_DECLARATION(dst),

1105

uint src_stride_w,

1106

uint dst_stride_w)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1107

{

1108

winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1123

dst_offset_first_element_in_bytes,

1124

src_stride_w,

1125

dst_stride_w);

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1126

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1127

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

1128

1129

#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

1130

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2

1131

*

1132

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1133

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1134

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1135

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1136

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1137

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1138

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1139

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1140

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1141

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1142

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1143

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1144

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1145

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1146

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

1147

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1148

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1149

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1150

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1151

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1152

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1153

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1154

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1155

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1156

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1157

*/

1158

__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(

1159

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1160

TENSOR3D_DECLARATION(dst),

1161

uint src_stride_w,

1162

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1163

{

1164

winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1179

dst_offset_first_element_in_bytes,

1180

src_stride_w,

1181

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1182

}

1183

1184

/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2

1185

*

1186

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1187

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1188

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1189

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1190

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1191

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1192

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1193

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1194

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1195

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1196

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1197

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1198

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1199

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1200

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

1201

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1202

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1203

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1204

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1205

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1206

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1207

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1208

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1209

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1210

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1211

*/

1212

__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(

1213

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1214

TENSOR3D_DECLARATION(dst),

1215

uint src_stride_w,

1216

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1217

{

1218

winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1233

dst_offset_first_element_in_bytes,

1234

src_stride_w,

1235

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1236

}

1237

1238

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4

1239

*

1240

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1241

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1242

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1243

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1244

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1245

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1246

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1247

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1248

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1249

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1250

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1251

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1252

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1253

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1254

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

1255

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1256

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1257

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1258

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1259

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1260

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1261

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1262

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1263

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1264

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1265

*/

1266

__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(

1267

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1268

TENSOR3D_DECLARATION(dst),

1269

uint src_stride_w,

1270

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1271

{

1272

winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1287

dst_offset_first_element_in_bytes,

1288

src_stride_w,

1289

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1290

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1291

1292

/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4

1293

*

1294

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

1295

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

1296

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1297

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1298

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1299

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1300

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1301

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1302

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1303

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1304

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1305

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1306

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1307

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1308

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

1309

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1310

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1311

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1312

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1313

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1314

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1315

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1316

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1317

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1318

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1319

*/

1320

__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(

1321

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1322

TENSOR3D_DECLARATION(dst),

1323

uint src_stride_w,

1324

uint dst_stride_w)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1325

{

1326

winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1341

dst_offset_first_element_in_bytes,

1342

src_stride_w,

1343

dst_stride_w);

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1344

}

Gian Marco Iodice

d28b751

2018-07-06 12:59:28 +0100

[diff] [blame]

1345

#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Michele Di Giorgio