Blame - src/core/CL/cl_kernels/winograd_input_transform.cl - ml/ComputeLibrary

2018-10-29 18:01:52 +0000

[diff] [blame]

112

#else /* defined(SRC_DEPTH) */

113

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

114

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

115

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

116

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

117

118

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

119

VEC_DATA_TYPE(DATA_TYPE, 4)

120

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

121

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

122

VEC_DATA_TYPE(DATA_TYPE, 4)

123

in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

124

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

125

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

126

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

127

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

128

VEC_DATA_TYPE(DATA_TYPE, 4)

129

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

130

VEC_DATA_TYPE(DATA_TYPE, 4)

131

in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

132

VEC_DATA_TYPE(DATA_TYPE, 4)

133

in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

134

VEC_DATA_TYPE(DATA_TYPE, 4)

135

in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

136

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

137

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

138

VEC_DATA_TYPE(DATA_TYPE, 4)

139

tmp0 = in_row0;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

140

141

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

142

tmp0 -= in_row2;

143

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

144

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

145

DATA_TYPE out00 = tmp0.s0 - tmp0.s2;

146

DATA_TYPE out01 = tmp0.s1 + tmp0.s2;

147

DATA_TYPE out02 = tmp0.s2 - tmp0.s1;

148

DATA_TYPE out03 = tmp0.s1 - tmp0.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

149

150

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

151

VEC_DATA_TYPE(DATA_TYPE, 4)

152

tmp1 = in_row1 + in_row2;

153

VEC_DATA_TYPE(DATA_TYPE, 4)

154

tmp2 = in_row2 - in_row1;

155

VEC_DATA_TYPE(DATA_TYPE, 4)

156

tmp3 = in_row1 - in_row3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

157

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

158

DATA_TYPE out10 = tmp1.s0 - tmp1.s2;

159

DATA_TYPE out11 = tmp1.s1 + tmp1.s2;

160

DATA_TYPE out12 = tmp1.s2 - tmp1.s1;

161

DATA_TYPE out13 = tmp1.s1 - tmp1.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

162

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

163

DATA_TYPE out20 = tmp2.s0 - tmp2.s2;

164

DATA_TYPE out21 = tmp2.s1 + tmp2.s2;

165

DATA_TYPE out22 = tmp2.s2 - tmp2.s1;

166

DATA_TYPE out23 = tmp2.s1 - tmp2.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

167

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

168

DATA_TYPE out30 = tmp3.s0 - tmp3.s2;

169

DATA_TYPE out31 = tmp3.s1 + tmp3.s2;

170

DATA_TYPE out32 = tmp3.s2 - tmp3.s1;

171

DATA_TYPE out33 = tmp3.s1 - tmp3.s3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

172

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

173

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

174

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

175

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

176

#else /* defined(SRC_DEPTH) */

177

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;

178

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

179

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

180

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;

181

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;

182

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;

183

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

184

185

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

186

*((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;

187

*((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;

188

*((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;

189

*((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;

190

*((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;

191

*((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;

192

*((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;

193

*((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;

194

*((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;

195

*((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;

196

*((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;

197

*((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

198

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

199

}

200

201

/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is a multiple of 2

202

*

203

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).

204

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).

205

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

206

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

207

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

208

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

209

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

210

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

211

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

212

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

213

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

214

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

215

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

216

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

217

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

218

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)

219

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

220

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

221

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

222

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

223

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

224

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

225

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)

226

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

227

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

228

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

229

*/

230

__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(

231

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

232

TENSOR3D_DECLARATION(dst),

233

uint src_stride_w,

234

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

235

{

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

236

// Tile coordinates of this work-item: x and y are scaled by OUTPUT_TILE_W / OUTPUT_TILE_H on the source
// and select tile (x + y * NUM_TILES_X) on the destination.
const int x = get_global_id(0);

237

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

238

// Global id 2 is doubled because each work-item reads two consecutive Z planes.
// With SRC_DEPTH defined it is split into a plane index z and an index b that is applied to the W strides.
#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

239

const int z = (get_global_id(2) * 2) % SRC_DEPTH;

240

const int b = (get_global_id(2) * 2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

241

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

242

const int z = get_global_id(2) * 2;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

243

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

244

245

// Compute input address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

246

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

247

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

248

#else /* defined(SRC_DEPTH) */

249

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

250

#endif /* defined(SRC_DEPTH) */

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

251

// Shift the start address back by PAD_LEFT elements and PAD_TOP rows.
src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

252

253

// Load the first Z plane: a single row of four elements (horizontal), four elements gathered
// one per row (vertical), or four rows of four elements (neither macro defined).
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

254

VEC_DATA_TYPE(DATA_TYPE, 4)

255

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

256

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

257

VEC_DATA_TYPE(DATA_TYPE, 4)

258

in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

259

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

260

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

261

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

262

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

263

VEC_DATA_TYPE(DATA_TYPE, 4)

264

in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

265

VEC_DATA_TYPE(DATA_TYPE, 4)

266

in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

267

VEC_DATA_TYPE(DATA_TYPE, 4)

268

in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

269

VEC_DATA_TYPE(DATA_TYPE, 4)

270

in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

271

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

272

273

// Advance by one Z stride and load the second plane the same way into in_row4..in_row7.
src_addr += src_stride_z;

274

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

275

VEC_DATA_TYPE(DATA_TYPE, 4)

276

in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

277

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

278

VEC_DATA_TYPE(DATA_TYPE, 4)

279

in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

280

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

281

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

282

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

283

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

284

VEC_DATA_TYPE(DATA_TYPE, 4)

285

in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

286

VEC_DATA_TYPE(DATA_TYPE, 4)

287

in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

288

VEC_DATA_TYPE(DATA_TYPE, 4)

289

in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

290

VEC_DATA_TYPE(DATA_TYPE, 4)

291

in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

292

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

293

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

294

// tmp0 (first plane) and tmp4 (second plane) feed the first group of four outputs;
// in the full 2D case the third loaded row is subtracted from the first.
VEC_DATA_TYPE(DATA_TYPE, 4)

295

tmp0 = in_row0;

296

VEC_DATA_TYPE(DATA_TYPE, 4)

297

tmp4 = in_row4;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

298

299

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

300

tmp0 -= in_row2;

301

tmp4 -= in_row6;

302

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

303

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

304

// Every outXY is a 2-component vector: the first component comes from the first plane,
// the second component from the second plane.
VEC_DATA_TYPE(DATA_TYPE, 2)

305

out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);

306

VEC_DATA_TYPE(DATA_TYPE, 2)

307

out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);

308

VEC_DATA_TYPE(DATA_TYPE, 2)

309

out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);

310

VEC_DATA_TYPE(DATA_TYPE, 2)

311

out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

312

313

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

314

// Remaining row combinations for the first plane (tmp1..tmp3) and the second plane (tmp5..tmp7).
VEC_DATA_TYPE(DATA_TYPE, 4)

315

tmp1 = in_row1 + in_row2;

316

VEC_DATA_TYPE(DATA_TYPE, 4)

317

tmp2 = in_row2 - in_row1;

318

VEC_DATA_TYPE(DATA_TYPE, 4)

319

tmp3 = in_row1 - in_row3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

320

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

321

VEC_DATA_TYPE(DATA_TYPE, 4)

322

tmp5 = in_row5 + in_row6;

323

VEC_DATA_TYPE(DATA_TYPE, 4)

324

tmp6 = in_row6 - in_row5;

325

VEC_DATA_TYPE(DATA_TYPE, 4)

326

tmp7 = in_row5 - in_row7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

327

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

328

VEC_DATA_TYPE(DATA_TYPE, 2)

329

out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);

330

VEC_DATA_TYPE(DATA_TYPE, 2)

331

out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);

332

VEC_DATA_TYPE(DATA_TYPE, 2)

333

out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);

334

VEC_DATA_TYPE(DATA_TYPE, 2)

335

out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

336

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

337

VEC_DATA_TYPE(DATA_TYPE, 2)

338

out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);

339

VEC_DATA_TYPE(DATA_TYPE, 2)

340

out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);

341

VEC_DATA_TYPE(DATA_TYPE, 2)

342

out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);

343

VEC_DATA_TYPE(DATA_TYPE, 2)

344

out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

345

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

346

VEC_DATA_TYPE(DATA_TYPE, 2)

347

out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);

348

VEC_DATA_TYPE(DATA_TYPE, 2)

349

out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);

350

VEC_DATA_TYPE(DATA_TYPE, 2)

351

out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);

352

VEC_DATA_TYPE(DATA_TYPE, 2)

353

out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

354

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

355

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

356

// Destination address: z selects the element along x, the tile index (x + y * NUM_TILES_X) selects the row,
// and (when SRC_DEPTH is defined) b is applied to the W stride.
#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

357

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

358

#else /* defined(SRC_DEPTH) */

359

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;

360

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

361

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

362

// Consecutive results are dst_stride_z bytes apart; each vstore2 writes the two planes' values
// to two adjacent elements.
vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));

363

vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));

364

vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));

365

vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

366

367

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

368

vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));

369

vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));

370

vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));

371

vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));

372

vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));

373

vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));

374

vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));

375

vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));

376

vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));

377

vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));

378

vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));

379

vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

380

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

381

}

382

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

383

/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

384

*

385

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).

386

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).

387

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

388

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

389

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

390

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

391

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

392

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

393

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

394

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

395

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

396

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

397

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

398

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

399

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

400

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)

401

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

402

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

403

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

404

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

405

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

406

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

407

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)

408

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

409

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

410

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

411

*/

412

__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(

413

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

414

TENSOR3D_DECLARATION(dst),

415

uint src_stride_w,

416

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

417

{

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

418

const int x = get_global_id(0);

419

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

420

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

421

const int z = get_global_id(2) % SRC_DEPTH;

422

const int b = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

423

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

424

const int z = get_global_id(2);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

425

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

426

427

// Compute input address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

428

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

429

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

430

#else /* defined(SRC_DEPTH) */

431

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

432

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

433

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

434

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

435

436

#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

437

// Row0

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

438

VEC_DATA_TYPE(DATA_TYPE, 4)

439

d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

440

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

441

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

442

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));

443

VEC_DATA_TYPE(DATA_TYPE, 2)

444

d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),

445

*((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

446

#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

447

// Row0

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

448

VEC_DATA_TYPE(DATA_TYPE, 4)

449

d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

450

VEC_DATA_TYPE(DATA_TYPE, 2)

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

451

d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

452

#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

453

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

454

DATA_TYPE out0 = 0.0f;

455

DATA_TYPE out1 = 0.0f;

456

DATA_TYPE out2 = 0.0f;

457

DATA_TYPE out3 = 0.0f;

458

DATA_TYPE out4 = 0.0f;

459

DATA_TYPE out5 = 0.0f;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

460

461

// Channels [0, 5]: [out00, out01, out02, out03, out04, out05]

462

out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;

463

out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;

464

out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;

465

out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;

466

out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;

467

out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;

468

469

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

470

// Row4

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

471

VEC_DATA_TYPE(DATA_TYPE, 4)

472

d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

473

VEC_DATA_TYPE(DATA_TYPE, 2)

474

d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

475

476

// k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

477

DATA_TYPE k0 = d41.s0;

478

DATA_TYPE k1 = d41.s0;

479

DATA_TYPE k2 = d41.s0;

480

DATA_TYPE k3 = d41.s0;

481

DATA_TYPE k4 = d41.s0;

482

DATA_TYPE k5 = 0.0f;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

483

484

k0 += 4.0f * d40.s0 - 5.0f * d40.s2;

485

k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;

486

k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;

487

k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;

488

k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;

489

k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;

out0 += k0;

out1 += k1;

out2 += k2;

out3 += k3;

out4 += k4;

out5 += k5;

// Row2

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

499

VEC_DATA_TYPE(DATA_TYPE, 4)

500

d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

501

VEC_DATA_TYPE(DATA_TYPE, 2)

502

d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

503

504

out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;

505

out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;

506

out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;

507

out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;

508

out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;

509

out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;

510

#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

511

512

// Compute destination address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

513

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

514

__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

515

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

516

__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

517

#endif /* defined(SRC_DEPTH) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

518

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

519

uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

520

521

*(dst_addr) = out0;

522

dst_addr += dst_plane_stride;

523

*(dst_addr) = out1;

524

dst_addr += dst_plane_stride;

525

*(dst_addr) = out2;

526

dst_addr += dst_plane_stride;

527

*(dst_addr) = out3;

528

dst_addr += dst_plane_stride;

529

*(dst_addr) = out4;

530

dst_addr += dst_plane_stride;

531

*(dst_addr) = out5;

532

dst_addr += dst_plane_stride;

533

534

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

DATA_TYPE out6 = k0;

DATA_TYPE out7 = k1;

DATA_TYPE out8 = k2;

DATA_TYPE out9 = k3;

DATA_TYPE out10 = k4;

540

DATA_TYPE out11 = k5;

541

DATA_TYPE out12 = k0;

542

DATA_TYPE out13 = k1;

543

DATA_TYPE out14 = k2;

544

DATA_TYPE out15 = k3;

545

DATA_TYPE out16 = k4;

546

DATA_TYPE out17 = k5;

547

DATA_TYPE out18 = k0;

548

DATA_TYPE out19 = k1;

549

DATA_TYPE out20 = k2;

550

DATA_TYPE out21 = k3;

551

DATA_TYPE out22 = k4;

552

DATA_TYPE out23 = k5;

553

DATA_TYPE out24 = k0;

554

DATA_TYPE out25 = k1;

555

DATA_TYPE out26 = k2;

556

DATA_TYPE out27 = k3;

557

DATA_TYPE out28 = k4;

558

DATA_TYPE out29 = k5;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

559

560

// Row1

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

561

VEC_DATA_TYPE(DATA_TYPE, 4)

562

d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

563

VEC_DATA_TYPE(DATA_TYPE, 2)

564

d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

565

566

// Row3

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

567

VEC_DATA_TYPE(DATA_TYPE, 4)

568

d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

569

VEC_DATA_TYPE(DATA_TYPE, 2)

570

d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

571

572

// Compute common parts for the channels between [6, 29]

573

// Channels [6, 11]: [out10, out11, out12, out13, out14, out15]

574

// Channels [12, 17]: [out20, out21, out22, out23, out24, out25]

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

575

DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;

576

DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;

577

DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;

578

DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;

579

DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;

580

DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;

581

DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;

582

DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;

583

DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;

584

DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;

585

DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;

586

DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

587

588

// Channels [18, 23]: [out30, out31, out32, out33, out34, out35]

589

// Channels [24, 29]: [out40, out41, out42, out43, out44, out45]

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

590

DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;

591

DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0

592

DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0

593

DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;

594

DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;

595

DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3

596

DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0

597

DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;

598

DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;

599

DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)

600

DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1

601

DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

602

603

out6 += part0 - part1;

604

out12 += part0 + part1;

605

out7 += part2 + part3 + part4 + part5;

606

out8 += part2 - part3 + part4 - part5;

607

out13 += part2 + part3 - part4 - part5;

608

out14 += part2 - part3 - part4 + part5;

609

out9 += part6 + part7 + part8 + part9;

610

out10 += part6 - part7 + part8 - part9;

611

out15 += part6 - part7 - part8 + part9;

612

out16 += part6 + part7 - part8 - part9;

613

out11 += part10 + part11;

614

out17 += part10 - part11;

615

616

out18 += part13 - part12;

617

out24 += part13 + part12;

618

out19 += part14 + part15 + part16 + part17;

619

out20 += part14 - part15 + part16 - part17;

620

out25 += part14 - part15 - part16 + part17;

621

out26 += part14 + part15 - part16 - part17;

622

out21 += part18 + part19 + part20 + part21;

623

out22 += part18 - part19 + part20 - part21;

624

out27 += part18 - part19 - part20 + part21;

625

out28 += part18 + part19 - part20 - part21;

626

out23 += part22 + part23;

627

out29 += part22 - part23;

628

629

*(dst_addr) = out6;

630

dst_addr += dst_plane_stride;

631

*(dst_addr) = out7;

632

dst_addr += dst_plane_stride;

633

*(dst_addr) = out8;

634

dst_addr += dst_plane_stride;

635

*(dst_addr) = out9;

636

dst_addr += dst_plane_stride;

637

*(dst_addr) = out10;

638

dst_addr += dst_plane_stride;

639

*(dst_addr) = out11;

640

dst_addr += dst_plane_stride;

641

*(dst_addr) = out12;

642

dst_addr += dst_plane_stride;

643

*(dst_addr) = out13;

644

dst_addr += dst_plane_stride;

645

*(dst_addr) = out14;

646

dst_addr += dst_plane_stride;

647

*(dst_addr) = out15;

648

dst_addr += dst_plane_stride;

649

*(dst_addr) = out16;

650

dst_addr += dst_plane_stride;

651

*(dst_addr) = out17;

652

dst_addr += dst_plane_stride;

653

654

*(dst_addr) = out18;

655

dst_addr += dst_plane_stride;

656

*(dst_addr) = out19;

657

dst_addr += dst_plane_stride;

658

*(dst_addr) = out20;

659

dst_addr += dst_plane_stride;

660

*(dst_addr) = out21;

661

dst_addr += dst_plane_stride;

662

*(dst_addr) = out22;

663

dst_addr += dst_plane_stride;

664

*(dst_addr) = out23;

665

dst_addr += dst_plane_stride;

666

*(dst_addr) = out24;

667

dst_addr += dst_plane_stride;

668

*(dst_addr) = out25;

669

dst_addr += dst_plane_stride;

670

*(dst_addr) = out26;

671

dst_addr += dst_plane_stride;

672

*(dst_addr) = out27;

673

dst_addr += dst_plane_stride;

674

*(dst_addr) = out28;

675

dst_addr += dst_plane_stride;

676

*(dst_addr) = out29;

677

dst_addr += dst_plane_stride;

678

679

// Row5

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

680

VEC_DATA_TYPE(DATA_TYPE, 4)

681

d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

682

VEC_DATA_TYPE(DATA_TYPE, 2)

683

d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

684

685

// Channels [30, 35]

686

out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

687

out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

688

out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

689

out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

690

out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;

691

out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;

692

693

*(dst_addr) = out0;

694

dst_addr += dst_plane_stride;

695

*(dst_addr) = out1;

696

dst_addr += dst_plane_stride;

697

*(dst_addr) = out2;

698

dst_addr += dst_plane_stride;

699

*(dst_addr) = out3;

700

dst_addr += dst_plane_stride;

701

*(dst_addr) = out4;

702

dst_addr += dst_plane_stride;

703

*(dst_addr) = out5;

704

dst_addr += dst_plane_stride;

705

#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

706

}

707

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

708

/** This OpenCL kernel computes the input transform when the kernel size is 5x5, 5x1 or 1x5, the output tile is 4x4, 4x1 or 1x4 and the data layout is NCHW

709

*

710

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).

711

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).

712

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

713

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

714

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

715

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

716

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

717

*

718

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

719

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

720

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

721

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

722

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

723

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

724

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

725

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

726

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

727

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

728

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

729

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

730

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

731

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

732

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

733

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

734

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

735

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

736

*/

737

__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(

738

TENSOR3D_DECLARATION(src),

739

TENSOR3D_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int x = get_global_id(0);

744

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

745

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

746

const int z = get_global_id(2) % SRC_DEPTH;

747

const int b = get_global_id(2) / SRC_DEPTH;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

748

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

749

const int z = get_global_id(2);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

750

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

751

752

// Compute input address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

753

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

754

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

755

#else /* defined(SRC_DEPTH) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

756

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

757

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

758

src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

759

760

// Load input tile

761

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

762

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));

763

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

764

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),

765

*((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),

766

*((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),

767

*((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),

768

*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),

769

*((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),

770

*((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),

771

*((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));

772

#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

773

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));

774

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

775

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

776

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

777

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

778

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

779

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));

780

const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));

781

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

782

783

// Calculate common factors for intermediate tensor

784

VEC_DATA_TYPE(DATA_TYPE, 8)

785

tmp0 = in_row0;

786

VEC_DATA_TYPE(DATA_TYPE, 8)

787

comm_fact0 = 0.0f;

788

789

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

790

comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25 * in_row4;

791

tmp0 += -in_row6 + (DATA_TYPE)5.25 * in_row4 - (DATA_TYPE)5.25 * in_row2;

792

793

VEC_DATA_TYPE(DATA_TYPE, 8)

794

comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25 * in_row3;

795

VEC_DATA_TYPE(DATA_TYPE, 8)

796

comm_fact2 = (DATA_TYPE)0.25 * in_row2 - (DATA_TYPE)1.25 * in_row4 + in_row6;

797

798

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;

799

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;

800

801

comm_fact0 = (DATA_TYPE)2.5 * in_row3;

802

comm_fact1 = (DATA_TYPE)0.5 * in_row1 - comm_fact0 + (DATA_TYPE)2.0 * in_row5;

803

804

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;

805

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;

806

807

comm_fact1 = (DATA_TYPE)2.0 * in_row1 - comm_fact0 + (DATA_TYPE)0.5 * in_row5;

808

comm_fact2 = (DATA_TYPE)4.0 * in_row2 - (DATA_TYPE)5.0 * in_row4 + in_row6;

809

810

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;

811

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;

812

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25 * in_row3 - (DATA_TYPE)5.25 * in_row5;

813

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

814

815

// Calculate output rows (reuse comm_fact0 vector)

816

VEC_DATA_TYPE(DATA_TYPE, 8)

817

out0;

818

819

OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);

820

821

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

822

VEC_DATA_TYPE(DATA_TYPE, 8)

823

out1, out2, out3, out4, out5, out6, out7;

824

825

OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);

826

OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);

827

OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);

828

OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);

829

OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);

830

OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);

831

OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);

832

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

833

834

// Store values across the channels

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

835

#if defined(SRC_DEPTH)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

836

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

837

#else /* defined(SRC_DEPTH) */

838

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;

839

#endif /* defined(SRC_DEPTH) */

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

840

841

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;

842

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;

843

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;

844

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;

845

*((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;

846

*((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;

847

*((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;

848

*((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;

849

850

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

851

*((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;

852

*((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;

853

*((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;

854

*((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;

855

*((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;

856

*((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;

857

*((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;

858

*((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;

859

*((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;

860

*((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;

861

*((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;

862

*((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;

863

*((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;

864

*((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;

865

*((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;

866

*((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;

867

*((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;

868

*((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;

869

*((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;

870

*((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;

871

*((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;

872

*((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;

873

*((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;

874

*((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;

875

*((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;

876

*((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;

877

*((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;

878

*((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;

879

*((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;

880

*((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;

881

*((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;

882

*((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;

883

*((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;

884

*((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;

885

*((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;

886

*((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;

887

*((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;

888

*((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;

889

*((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;

890

*((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;

891

*((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;

892

*((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;

893

*((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;

894

*((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;

895

*((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;

896

*((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;

897

*((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;

898

*((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;

899

*((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;

900

*((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;

901

*((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;

902

*((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;

903

*((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;

904

*((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;

905

*((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;

906

*((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;

907

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

908

}

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

909

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

910

#if defined(SRC_DIM_1) && defined(SRC_DIM_2)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

911

/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

912

*

913

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).

914

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).

915

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

916

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

917

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

918

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

919

* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

920

* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

921

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

922

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

923

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

924

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

925

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

926

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

927

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

928

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

929

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

930

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

931

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

932

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

933

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

934

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

935

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

936

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

937

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

938

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

939

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

940

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

941

*/

942

__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(

943

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

944

TENSOR3D_DECLARATION(dst),

945

uint src_stride_w,

946

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

947

{

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

948

const int x = get_global_id(0);

949

const int y = get_global_id(1);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

950

#if defined(NUM_TILES_Y)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

951

const int z = get_global_id(2) % NUM_TILES_Y;

952

const int b = get_global_id(2) / NUM_TILES_Y;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

953

#else /* defined(NUM_TILES_Y) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

954

const int z = get_global_id(2);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

955

#endif /* defined(NUM_TILES_Y) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

956

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

957

#if defined(NUM_TILES_Y)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

958

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

959

#else /* defined(NUM_TILES_Y) */

960

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);

961

#endif /* defined(NUM_TILES_Y) */

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

962

963

// Clamp coordinates. This clamp is valid for all rows

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

964

int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;

965

int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT;

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

966

y_coord0 = clamp(y_coord0, (int4) - 1, (int4)SRC_DIM_1);

967

y_coord1 = clamp(y_coord1, (int2) - 1, (int2)SRC_DIM_1);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

968

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

int z_coord;

int4 valid_y0;

int2 valid_y1;

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

974

// Row4

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

975

z_coord = (z * 4) - (int)PAD_TOP + 4;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

976

977

// If z < 0, set y to -1

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

978

valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);

979

valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

980

// If z >= SRC_DIM_2, set y to SRC_DIM_1

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

981

valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);

982

valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

983

984

// Clamp z coordinate

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

985

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

986

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

987

DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);

988

DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);

989

DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);

990

DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);

991

DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);

992

DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

993

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

DATA_TYPE k0 = d44;

DATA_TYPE k1 = d44;

DATA_TYPE k2 = d44;

DATA_TYPE k3 = d44;

DATA_TYPE k4 = d44;

DATA_TYPE k5 = (DATA_TYPE)0.0f;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1000

1001

k0 += 4.0f * d40 - 5.0f * d42;

1002

k1 += -4.0f * d41 - 4.0f * d42 + d43;

1003

k2 += 4.0f * d41 - 4.0f * d42 - d43;

1004

k3 += -2.0f * d41 + 2.0f * d43 - d42;

1005

k4 += 2.0f * d41 - 2.0f * d43 - d42;

1006

k5 += 4.0f * d41 - 5.0f * d43 + d45;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1007

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1008

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1009

#if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1010

// Row0

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

1011

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1012

1013

#if PAD_TOP != 0

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

1014

valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);

1015

valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);

1016

valid_y0 = select(valid_y0, (int)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);

1017

valid_y1 = select(valid_y1, (int)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);

1018

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1019

#else // PAD_TOP != 0

1020

valid_y0 = y_coord0;

1021

valid_y1 = y_coord1;

1022

#endif // if PAD_TOP == 0, we cannot read out of bound

1023

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1024

DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);

1025

DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);

1026

DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);

1027

DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);

1028

DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);

1029

DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1030

#else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

1031

int4 z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;

1032

int2 z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1033

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1034

valid_y0 = select((int4)y_coord0.s0, (int4) - 1, z_coords0 < (int4)0);

1035

valid_y1 = select((int2)y_coord0.s0, (int2) - 1, z_coords1 < (int2)0);

1036

valid_y0 = select(valid_y0, (int4)SRC_DIM_1, z_coords0 >= (int4)SRC_DIM_2);

1037

valid_y1 = select(valid_y1, (int2)SRC_DIM_1, z_coords1 >= (int2)SRC_DIM_2);

1038

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

1039

z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)((int)SRC_DIM_2 - 1));

1040

z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)((int)SRC_DIM_2 - 1));

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1041

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

1042

DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);

1043

DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);

1044

DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);

1045

DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);

1046

DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);

1047

DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1048

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

1049

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1050

DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;

1051

DATA_TYPE out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;

1052

DATA_TYPE out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;

1053

DATA_TYPE out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;

1054

DATA_TYPE out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;

1055

DATA_TYPE out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1056

1057

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1058

// Row2

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

1059

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;

1060

valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);

1061

valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);

1062

valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);

1063

valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);

1064

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1065

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1066

DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);

1067

DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);

1068

DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);

1069

DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);

1070

DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);

1071

DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1072

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

out0 += k0;

out1 += k1;

out2 += k2;

out3 += k3;

out4 += k4;

out5 += k5;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

DATA_TYPE out6 = k0;

DATA_TYPE out7 = k1;

DATA_TYPE out8 = k2;

DATA_TYPE out9 = k3;

DATA_TYPE out10 = k4;

1084

DATA_TYPE out11 = k5;

1085

DATA_TYPE out12 = k0;

1086

DATA_TYPE out13 = k1;

1087

DATA_TYPE out14 = k2;

1088

DATA_TYPE out15 = k3;

1089

DATA_TYPE out16 = k4;

1090

DATA_TYPE out17 = k5;

1091

DATA_TYPE out18 = k0;

1092

DATA_TYPE out19 = k1;

1093

DATA_TYPE out20 = k2;

1094

DATA_TYPE out21 = k3;

1095

DATA_TYPE out22 = k4;

1096

DATA_TYPE out23 = k5;

1097

DATA_TYPE out24 = k0;

1098

DATA_TYPE out25 = k1;

1099

DATA_TYPE out26 = k2;

1100

DATA_TYPE out27 = k3;

1101

DATA_TYPE out28 = k4;

1102

DATA_TYPE out29 = k5;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1103

1104

// Channels [0, 5]: [out00, out01, out02, out03, out04, out05]

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1105

out0 += -20.0f * d20 + 25.0f * d22 - 5.0f * d24;

1106

out1 += 20.0f * d21 + 20.0f * d22 - 5.0f * d23 - 5.0f * d24;

1107

out2 += -20.0f * d21 + 20.0f * d22 + 5.0f * d23 - 5.0f * d24;

1108

out3 += 10.0f * d21 + 5.0f * d22 - 10.0f * d23 - 5.0f * d24;

1109

out4 += -10.0f * d21 + 5.0f * d22 + 10.0f * d23 - 5.0f * d24;

1110

out5 += -20.0f * d21 + 25.0f * d23 - 5.0f * d25;

1111

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

1112

1113

// Compute destination address

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1114

#if defined(NUM_TILES_Y)

1115

__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);

1116

#else /* defined(NUM_TILES_Y) */

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

1117

__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1118

#endif /* defined(NUM_TILES_Y) */

1119

1120

uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1121

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1122

*((__global DATA_TYPE *)dst_addr) = out0;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1123

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1124

*((__global DATA_TYPE *)dst_addr) = out1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1125

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1126

*((__global DATA_TYPE *)dst_addr) = out2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1127

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1128

*((__global DATA_TYPE *)dst_addr) = out3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1129

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1130

*((__global DATA_TYPE *)dst_addr) = out4;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1131

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1132

*((__global DATA_TYPE *)dst_addr) = out5;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1133

dst_addr += dst_plane_stride;

1134

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1135

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1136

// Row1

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

1137

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1138

// Row1 can never be out of bounds

1139

valid_y0 = y_coord0;

1140

valid_y1 = y_coord1;

1141

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1142

DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);

1143

DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);

1144

DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);

1145

DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);

1146

DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);

1147

DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1148

1149

// Row3

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

1150

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;

1151

valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);

1152

valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);

1153

valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);

1154

valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);

1155

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1156

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1157

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1158

DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);

1159

DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);

1160

DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);

1161

DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);

1162

DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);

1163

DATA_TYPE d35 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1164

1165

// Compute common parts for the channels between [6, 29]

1166

// Channels [6, 11]: [out10, out11, out12, out13, out14, out15]

1167

// Channels [12, 17]: [out20, out21, out22, out23, out24, out25]

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1168

DATA_TYPE part0 = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;

1169

DATA_TYPE part1 = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;

1170

DATA_TYPE part2 = 16.0f * d22 - 4.0f * d24;

1171

DATA_TYPE part3 = 16.0f * d21 - 4.0f * d23;

1172

DATA_TYPE part4 = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;

1173

DATA_TYPE part5 = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;

1174

DATA_TYPE part6 = 4.0f * d22 - 4.0f * d24;

1175

DATA_TYPE part7 = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;

1176

DATA_TYPE part8 = 4.0f * d12 - 4.0f * d14 - d32 + d34;

1177

DATA_TYPE part9 = 8.0f * d21 - 8.0f * d23;

1178

DATA_TYPE part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;

1179

DATA_TYPE part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1180

1181

// Channels [18, 23]: [out30, out31, out32, out33, out34, out35]

1182

// Channels [24, 29]: [out40, out41, out42, out43, out44, out45]

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1183

DATA_TYPE part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;

1184

DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20 + 5.0f * d22 - d24

1185

DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d22 - d24

1186

DATA_TYPE part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;

1187

DATA_TYPE part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;

1188

DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d21 - d23

1189

DATA_TYPE part18 = part6 * 0.25f; // d22 - d24

1190

DATA_TYPE part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;

1191

DATA_TYPE part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;

1192

DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d21 - d23)

1193

DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d21 + 5.0f * d23 - d25

1194

DATA_TYPE part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35; // - 8.0f * d11 + 10.0f * d13 - 2.0f * d15 + 8.0f * d31 - 10.0f * d33 + 2.0f * d35;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1195

1196

out6 += part0 - part1;

1197

out12 += part0 + part1;

1198

out7 += part2 + part3 + part4 + part5;

1199

out8 += part2 - part3 + part4 - part5;

1200

out13 += part2 + part3 - part4 - part5;

1201

out14 += part2 - part3 - part4 + part5;

1202

out9 += part6 + part7 + part8 + part9;

1203

out10 += part6 - part7 + part8 - part9;

1204

out15 += part6 - part7 - part8 + part9;

1205

out16 += part6 + part7 - part8 - part9;

1206

out11 += part10 + part11;

1207

out17 += part10 - part11;

1208

1209

out18 += part13 - part12;

1210

out24 += part13 + part12;

1211

out19 += part14 + part15 + part16 + part17;

1212

out20 += part14 - part15 + part16 - part17;

1213

out25 += part14 - part15 - part16 + part17;

1214

out26 += part14 + part15 - part16 - part17;

1215

out21 += part18 + part19 + part20 + part21;

1216

out22 += part18 - part19 + part20 - part21;

1217

out27 += part18 - part19 - part20 + part21;

1218

out28 += part18 + part19 - part20 - part21;

1219

out23 += part22 + part23;

1220

out29 += part22 - part23;

1221

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1222

*((__global DATA_TYPE *)dst_addr) = out6;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1223

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1224

*((__global DATA_TYPE *)dst_addr) = out7;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1225

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1226

*((__global DATA_TYPE *)dst_addr) = out8;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1227

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1228

*((__global DATA_TYPE *)dst_addr) = out9;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1229

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1230

*((__global DATA_TYPE *)dst_addr) = out10;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1231

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1232

*((__global DATA_TYPE *)dst_addr) = out11;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1233

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1234

*((__global DATA_TYPE *)dst_addr) = out12;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1235

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1236

*((__global DATA_TYPE *)dst_addr) = out13;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1237

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1238

*((__global DATA_TYPE *)dst_addr) = out14;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1239

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1240

*((__global DATA_TYPE *)dst_addr) = out15;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1241

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1242

*((__global DATA_TYPE *)dst_addr) = out16;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1243

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1244

*((__global DATA_TYPE *)dst_addr) = out17;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1245

dst_addr += dst_plane_stride;

1246

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1247

*((__global DATA_TYPE *)dst_addr) = out18;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1248

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1249

*((__global DATA_TYPE *)dst_addr) = out19;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1250

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1251

*((__global DATA_TYPE *)dst_addr) = out20;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1252

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1253

*((__global DATA_TYPE *)dst_addr) = out21;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1254

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1255

*((__global DATA_TYPE *)dst_addr) = out22;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1256

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1257

*((__global DATA_TYPE *)dst_addr) = out23;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1258

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1259

*((__global DATA_TYPE *)dst_addr) = out24;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1260

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1261

*((__global DATA_TYPE *)dst_addr) = out25;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1262

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1263

*((__global DATA_TYPE *)dst_addr) = out26;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1264

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1265

*((__global DATA_TYPE *)dst_addr) = out27;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1266

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1267

*((__global DATA_TYPE *)dst_addr) = out28;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1268

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1269

*((__global DATA_TYPE *)dst_addr) = out29;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1270

dst_addr += dst_plane_stride;

1271

1272

// Row5

Georgios Pinitas

2018-07-18 18:06:32 +0100

[diff] [blame]

1273

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;

1274

valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);

1275

valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);

1276

valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);

1277

valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);

1278

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1279

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1280

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1281

DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);

1282

DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);

1283

DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);

1284

DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);

1285

DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);

1286

DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1287

1288

// Channels [30, 35]

1289

out0 = 16.0f * d10 - 20.0f * d12 - 20.0f * d30 + 25.0f * d32 + 4.0f * d50 - 5.0f * d52 + d54 + 4.0f * d14 - 5.0f * d34;

1290

out1 = -16.0f * d11 - 16.0f * d12 + 4.0f * d13 + 20.0f * d31 + 20.0f * d32 - 5.0f * d33 - 4.0f * d51 - 4.0f * d52 + d53 + d54 + 4.0f * d14 - 5.0f * d34;

1291

out2 = 16.0f * d11 - 16.0f * d12 - 4.0f * d13 - 20.0f * d31 + 20.0f * d32 + 5.0f * d33 + 4.0f * d51 - 4.0f * d52 - d53 + d54 + 4.0f * d14 - 5.0f * d34;

1292

out3 = -8.0f * d11 - 4.0f * d12 + 8.0f * d13 + 10.0f * d31 - 10.0f * d33 + 5.0f * d32 - 2.0f * d51 + 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;

1293

out4 = 8.0f * d11 - 4.0f * d12 - 8.0f * d13 - 10.0f * d31 + 5.0f * d32 + 10.0f * d33 + 2.0f * d51 - 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;

1294

out5 = 16.0f * d11 - 20.0f * d13 + 4.0f * d15 - 20.0f * d31 + 25.0f * d33 - 5.0f * d35 + 4.0f * d51 - 5.0f * d53 + d55;

1295

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1296

*((__global DATA_TYPE *)dst_addr) = out0;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1297

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1298

*((__global DATA_TYPE *)dst_addr) = out1;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1299

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1300

*((__global DATA_TYPE *)dst_addr) = out2;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1301

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1302

*((__global DATA_TYPE *)dst_addr) = out3;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1303

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1304

*((__global DATA_TYPE *)dst_addr) = out4;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1305

dst_addr += dst_plane_stride;

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1306

*((__global DATA_TYPE *)dst_addr) = out5;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1307

dst_addr += dst_plane_stride;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1308

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1309

}

1310

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1311

/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1312

*

1313

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).

1314

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1315

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

1316

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1317

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1318

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1319

* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1320

* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1321

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1322

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1323

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1324

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1325

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1326

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1327

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1328

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1329

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1330

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1331

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1332

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1333

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1334

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1335

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1336

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1337

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1338

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1339

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1340

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

1341

*/

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1342

__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
// ---------------------------------------------------------------------------------------------
// Helper macros. They are private to this kernel and are undefined again before its closing brace.
// ---------------------------------------------------------------------------------------------

// Read one DATA_TYPE element located (y_idx * src_stride_y + z_idx * src_stride_z) bytes from src_addr.
// y_idx may be negative, hence the signed cast of src_stride_y.
#define WINO_4X4_5X5_NHWC_LOAD(y_idx, z_idx) \
    (*(__global DATA_TYPE *)(src_addr + (y_idx) * (int)src_stride_y + (z_idx) * src_stride_z))

// Gather 8 elements that share the same z index; the y index of each lane comes from the int8 vector y_vec
#define WINO_4X4_5X5_NHWC_GATHER(y_vec, z_idx)           \
    ((VEC_DATA_TYPE(DATA_TYPE, 8))(                      \
        WINO_4X4_5X5_NHWC_LOAD((y_vec).s0, z_idx),       \
        WINO_4X4_5X5_NHWC_LOAD((y_vec).s1, z_idx),       \
        WINO_4X4_5X5_NHWC_LOAD((y_vec).s2, z_idx),       \
        WINO_4X4_5X5_NHWC_LOAD((y_vec).s3, z_idx),       \
        WINO_4X4_5X5_NHWC_LOAD((y_vec).s4, z_idx),       \
        WINO_4X4_5X5_NHWC_LOAD((y_vec).s5, z_idx),       \
        WINO_4X4_5X5_NHWC_LOAD((y_vec).s6, z_idx),       \
        WINO_4X4_5X5_NHWC_LOAD((y_vec).s7, z_idx)))

// Load row (z_first + row_idx) of the 8x8 input tile into row_vec.
// When that z coordinate is below 0 every lane reads at y = -1; when it is >= SRC_DIM_2 every lane reads at
// y = SRC_DIM_1. In both cases the z coordinate actually used for addressing is clamped to [0, SRC_DIM_2 - 1].
#define WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row_vec, y_vec, z_first, row_idx)               \
    {                                                                                   \
        int  z_row = (z_first) + (row_idx);                                             \
        int8 y_row = select((y_vec), (int8)(-1), (int8)z_row < 0);                      \
        y_row      = select(y_row, (int8)SRC_DIM_1, (int8)z_row >= (int)SRC_DIM_2);     \
        z_row      = clamp(z_row, 0, (int)SRC_DIM_2 - 1);                               \
        row_vec    = WINO_4X4_5X5_NHWC_GATHER(y_row, z_row);                            \
    }

// Scatter the 8 lanes of row_vec to 8 consecutive dst_stride_z steps, starting at step `first`
#define WINO_4X4_5X5_NHWC_STORE(row_vec, first)                                              \
    {                                                                                        \
        *((__global DATA_TYPE *)(dst_addr + ((first) + 0) * dst_stride_z)) = (row_vec).s0;   \
        *((__global DATA_TYPE *)(dst_addr + ((first) + 1) * dst_stride_z)) = (row_vec).s1;   \
        *((__global DATA_TYPE *)(dst_addr + ((first) + 2) * dst_stride_z)) = (row_vec).s2;   \
        *((__global DATA_TYPE *)(dst_addr + ((first) + 3) * dst_stride_z)) = (row_vec).s3;   \
        *((__global DATA_TYPE *)(dst_addr + ((first) + 4) * dst_stride_z)) = (row_vec).s4;   \
        *((__global DATA_TYPE *)(dst_addr + ((first) + 5) * dst_stride_z)) = (row_vec).s5;   \
        *((__global DATA_TYPE *)(dst_addr + ((first) + 6) * dst_stride_z)) = (row_vec).s6;   \
        *((__global DATA_TYPE *)(dst_addr + ((first) + 7) * dst_stride_z)) = (row_vec).s7;   \
    }

    // Work-item coordinates. When NUM_TILES_Y is defined, global id 2 is split into a tile index (z)
    // and an index along the W dimension (b, presumably the batch - confirm against the host code).
    const int x = get_global_id(0);
    const int y = get_global_id(1);
#if defined(NUM_TILES_Y)
    const int z = get_global_id(2) % NUM_TILES_Y;
    const int b = get_global_id(2) / NUM_TILES_Y;
#else  /* defined(NUM_TILES_Y) */
    const int z = get_global_id(2);
#endif /* defined(NUM_TILES_Y) */

    // Base input address: x selects the element along the innermost dimension
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
#if defined(NUM_TILES_Y)
    src_addr += b * src_stride_w;
#endif /* defined(NUM_TILES_Y) */

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    // ------------------------------------------------------------------------------------------
    // 1D transform: a single vector of 8 input elements is loaded and transformed
    // ------------------------------------------------------------------------------------------
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    // y coordinates of the 8 elements, limited to [-1, SRC_DIM_1]
    const int8 cols = clamp((int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT, (int8)(-1), (int8)SRC_DIM_1);

    // No padding offset and no clamping is applied along z in this configuration
    const int row = z * OUTPUT_TILE_H;

    // Load the input tile
    VEC_DATA_TYPE(DATA_TYPE, 8)
    t0 = WINO_4X4_5X5_NHWC_GATHER(cols, row);
#else  /* defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) */
    // z coordinates of the 8 elements before clamping
    const int8 rows_raw = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;

    // No padding offset and no clamping is applied along y in this configuration. Lanes whose z coordinate is
    // out of range are redirected instead: y = -1 when z < 0, y = SRC_DIM_1 when z >= SRC_DIM_2.
    int8 cols = select((int8)(y * (int)OUTPUT_TILE_W), (int8)(-1), rows_raw < (int8)0);
    cols      = select(cols, (int8)SRC_DIM_1, rows_raw >= (int8)SRC_DIM_2);

    // z coordinates used for addressing, clamped to [0, SRC_DIM_2 - 1]
    const int8 rows = clamp(rows_raw, (int8)0, (int8)SRC_DIM_2 - 1);

    // Load the input tile
    VEC_DATA_TYPE(DATA_TYPE, 8)
    t0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(
             WINO_4X4_5X5_NHWC_LOAD(cols.s0, rows.s0),
             WINO_4X4_5X5_NHWC_LOAD(cols.s1, rows.s1),
             WINO_4X4_5X5_NHWC_LOAD(cols.s2, rows.s2),
             WINO_4X4_5X5_NHWC_LOAD(cols.s3, rows.s3),
             WINO_4X4_5X5_NHWC_LOAD(cols.s4, rows.s4),
             WINO_4X4_5X5_NHWC_LOAD(cols.s5, rows.s5),
             WINO_4X4_5X5_NHWC_LOAD(cols.s6, rows.s6),
             WINO_4X4_5X5_NHWC_LOAD(cols.s7, rows.s7));
#endif /* defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) */

    // Scratch vector handed to OUTPUT_ROW_4x4_5x5 together with the output row
    VEC_DATA_TYPE(DATA_TYPE, 8)
    scratch = 0.0f;
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;

    OUTPUT_ROW_4x4_5x5(out0, t0, scratch);
#else  /* defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) */
    // ------------------------------------------------------------------------------------------
    // 2D transform: an 8x8 input tile is loaded and transformed
    // ------------------------------------------------------------------------------------------

    // y coordinates of the 8 columns, limited to [-1, SRC_DIM_1]. Shared by all rows
    const int8 cols = clamp((int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT, (int8)(-1), (int8)SRC_DIM_1);

    // z coordinate of the first row of the tile (may be negative because of the top padding)
    const int z_top = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP;

    // Load the input tile, one row at a time
    VEC_DATA_TYPE(DATA_TYPE, 8)
    row0, row1, row2, row3, row4, row5, row6, row7;
    WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row0, cols, z_top, 0);
    WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row1, cols, z_top, 1);
    WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row2, cols, z_top, 2);
    WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row3, cols, z_top, 3);
    WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row4, cols, z_top, 4);
    WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row5, cols, z_top, 5);
    WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row6, cols, z_top, 6);
    WINO_4X4_5X5_NHWC_LOAD_TILE_ROW(row7, cols, z_top, 7);

    // Terms shared by intermediate rows 1 and 2
    const VEC_DATA_TYPE(DATA_TYPE, 8) even_a = row2 + row6 - (DATA_TYPE)4.25f * row4;
    const VEC_DATA_TYPE(DATA_TYPE, 8) odd_a  = row1 + row5 - (DATA_TYPE)4.25f * row3;
    // Even-row term shared by intermediate rows 3 and 4
    const VEC_DATA_TYPE(DATA_TYPE, 8) even_b = (DATA_TYPE)0.25f * row2 - (DATA_TYPE)1.25f * row4 + row6;

    // Intermediate rows 0..2
    const VEC_DATA_TYPE(DATA_TYPE, 8) t0 = row0 - row6 + (DATA_TYPE)5.25f * row4 - (DATA_TYPE)5.25f * row2;
    const VEC_DATA_TYPE(DATA_TYPE, 8) t1 = even_a + odd_a;
    const VEC_DATA_TYPE(DATA_TYPE, 8) t2 = even_a - odd_a;

    // 2.5 * row3 appears in the odd-row terms of rows 3..6; the same vector is later reused as the
    // scratch argument of OUTPUT_ROW_4x4_5x5
    VEC_DATA_TYPE(DATA_TYPE, 8)
    scratch = (DATA_TYPE)2.5f * row3;

    // Intermediate rows 3..4
    const VEC_DATA_TYPE(DATA_TYPE, 8) odd_b = (DATA_TYPE)0.5f * row1 - scratch + (DATA_TYPE)2.f * row5;
    const VEC_DATA_TYPE(DATA_TYPE, 8) t3    = odd_b + even_b;
    const VEC_DATA_TYPE(DATA_TYPE, 8) t4    = even_b - odd_b;

    // Intermediate rows 5..7
    const VEC_DATA_TYPE(DATA_TYPE, 8) odd_c  = (DATA_TYPE)2.f * row1 - scratch + (DATA_TYPE)0.5f * row5;
    const VEC_DATA_TYPE(DATA_TYPE, 8) even_c = (DATA_TYPE)4.f * row2 - (DATA_TYPE)5.f * row4 + row6;
    const VEC_DATA_TYPE(DATA_TYPE, 8) t5     = odd_c + even_c;
    const VEC_DATA_TYPE(DATA_TYPE, 8) t6     = even_c - odd_c;
    const VEC_DATA_TYPE(DATA_TYPE, 8) t7     = row7 - row1 + (DATA_TYPE)5.25f * row3 - (DATA_TYPE)5.25f * row5;

    // Calculate the output rows
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out0, out1, out2, out3, out4, out5, out6, out7;
    OUTPUT_ROW_4x4_5x5(out0, t0, scratch);
    OUTPUT_ROW_4x4_5x5(out1, t1, scratch);
    OUTPUT_ROW_4x4_5x5(out2, t2, scratch);
    OUTPUT_ROW_4x4_5x5(out3, t3, scratch);
    OUTPUT_ROW_4x4_5x5(out4, t4, scratch);
    OUTPUT_ROW_4x4_5x5(out5, t5, scratch);
    OUTPUT_ROW_4x4_5x5(out6, t6, scratch);
    OUTPUT_ROW_4x4_5x5(out7, t7, scratch);
#endif /* defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) */

    // Output address: x keeps selecting the innermost element, (y + z * NUM_TILES_X) selects the tile along
    // dst_stride_y, and each transformed value goes to its own dst_stride_z step
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
#if defined(NUM_TILES_Y)
    dst_addr += b * dst_stride_w;
#endif /* defined(NUM_TILES_Y) */

    // The first 8 values are written in every configuration
    WINO_4X4_5X5_NHWC_STORE(out0, 0);

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    // The remaining 56 values exist only for the 2D transform
    WINO_4X4_5X5_NHWC_STORE(out1, 8);
    WINO_4X4_5X5_NHWC_STORE(out2, 16);
    WINO_4X4_5X5_NHWC_STORE(out3, 24);
    WINO_4X4_5X5_NHWC_STORE(out4, 32);
    WINO_4X4_5X5_NHWC_STORE(out5, 40);
    WINO_4X4_5X5_NHWC_STORE(out6, 48);
    WINO_4X4_5X5_NHWC_STORE(out7, 56);
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

#undef WINO_4X4_5X5_NHWC_STORE
#undef WINO_4X4_5X5_NHWC_LOAD_TILE_ROW
#undef WINO_4X4_5X5_NHWC_GATHER
#undef WINO_4X4_5X5_NHWC_LOAD
}

1671

1672

/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/2x1/1x2 when the data layout is NHWC

1673

*

1674

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).

1675

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).

1676

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

1677

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

1678

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1679

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1680

* @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1681

* @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

1682

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

1683

*

1684

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

1685

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

1686

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1687

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

1688

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1689

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

1690

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1691

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1692

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

1693

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1694

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1695

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1696

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1697

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1698

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1699

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1700

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1701

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1702

*/

1703

__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(

1704

TENSOR3D_DECLARATION(src),

1705

TENSOR3D_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int x = get_global_id(0);

1710

const int y = get_global_id(1);

1711

#if defined(NUM_TILES_Y)

1712

const int z = get_global_id(2) % NUM_TILES_Y;

1713

const int b = get_global_id(2) / NUM_TILES_Y;

1714

#else /* defined(NUM_TILES_Y) */

1715

const int z = get_global_id(2);

1716

#endif /* defined(NUM_TILES_Y) */

1717

1718

// Compute input address

1719

#if defined(NUM_TILES_Y)

1720

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;

1721

#else /* defined(NUM_TILES_Y) */

1722

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);

1723

#endif /* defined(NUM_TILES_Y) */

1724

1725

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

1726

1727

// Clamp coordinates. This clamp is valid for all rows

1728

int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;

1729

y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);

1730

1731

// Clamp coordinates. This clamp is valid for all columns

1732

int z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;

1733

int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); // If z < 0, set y to -1

1734

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2

1735

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1736

1737

// Load the input tile

1738

VEC_DATA_TYPE(DATA_TYPE, 8)

1739

in_row0;

1740

in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1741

in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1742

in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1743

in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1744

in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1745

in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1746

in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1747

in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1748

1749

VEC_DATA_TYPE(DATA_TYPE, 8)

1750

out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;

1751

1752

VEC_DATA_TYPE(DATA_TYPE, 8)

1753

tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;

1754

1755

VEC_DATA_TYPE(DATA_TYPE, 8)

1756

comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;

1757

1758

OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);

1759

1760

#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

1761

// We can skip the border clamping along the y dimension as we cannot read out-of-bound in case of 1x5 kernels

1762

int y_coord = y * (int)OUTPUT_TILE_W;

1763

1764

// Row0

1765

// We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels

1766

int8 z_coord = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;

1767

int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0); // If z < 0, set y to -1

1768

valid_y = select(valid_y, (int8)SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2

1769

z_coord = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1); // Clamp z coordinate

1770

1771

// Load the input tile

1772

VEC_DATA_TYPE(DATA_TYPE, 8)

1773

in_row0;

1774

in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * (int)src_stride_z);

1775

in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * (int)src_stride_z);

1776

in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * (int)src_stride_z);

1777

in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * (int)src_stride_z);

1778

in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * (int)src_stride_z);

1779

in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * (int)src_stride_z);

1780

in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * (int)src_stride_z);

1781

in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * (int)src_stride_z);

1782

1783

// Calculate common factors for intermediate tensor

1784

VEC_DATA_TYPE(DATA_TYPE, 8)

1785

tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;

1786

1787

VEC_DATA_TYPE(DATA_TYPE, 8)

1788

out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;

1789

1790

VEC_DATA_TYPE(DATA_TYPE, 8)

1791

comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;

1792

1793

OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);

1794

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

1795

VEC_DATA_TYPE(DATA_TYPE, 8)

1796

in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;

1797

1798

// Clamp coordinates. This clamp is valid for all rows

1799

int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;

1800

y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);

1801

1802

// Row0

1803

int z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;

1804

int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); // If z < 0, set y to -1

1805

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2

1806

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); // Clamp z coordinate

1807

1808

// Load the input tile

1809

in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1810

in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1811

in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1812

in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1813

in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1814

in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1815

in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1816

in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1817

1818

// Row1

1819

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;

1820

valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);

1821

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);

1822

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1823

1824

in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1825

in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1826

in_row1.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1827

in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1828

in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1829

in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1830

in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1831

in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1832

1833

// Row2

1834

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;

1835

valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);

1836

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);

1837

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1838

1839

in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1840

in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1841

in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1842

in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1843

in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1844

in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1845

in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1846

in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1847

1848

// Row3

1849

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;

1850

valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);

1851

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);

1852

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1853

1854

in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1855

in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1856

in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1857

in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1858

in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1859

in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1860

in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1861

in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1862

1863

// Row4

1864

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4;

1865

valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);

1866

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);

1867

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1868

1869

in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1870

in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1871

in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1872

in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1873

in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1874

in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1875

in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1876

in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1877

1878

// Row5

1879

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;

1880

valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);

1881

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);

1882

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1883

1884

in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1885

in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1886

in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1887

in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1888

in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1889

in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1890

in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1891

in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1892

1893

// Row6

1894

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6;

1895

valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);

1896

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);

1897

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1898

1899

in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1900

in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1901

in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1902

in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1903

in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1904

in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1905

in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1906

in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1907

1908

// Row7

1909

z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7;

1910

valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);

1911

valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);

1912

z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);

1913

1914

in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);

1915

in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);

1916

in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);

1917

in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);

1918

in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);

1919

in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);

1920

in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);

1921

in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);

1922

1923

VEC_DATA_TYPE(DATA_TYPE, 8)

1924

comm_fact0 = (DATA_TYPE)36.0f * in_row2 - (DATA_TYPE)13.0f * in_row4 + in_row6;

1925

VEC_DATA_TYPE(DATA_TYPE, 8)

1926

comm_fact1 = (DATA_TYPE)36.0f * in_row1 - (DATA_TYPE)13.0f * in_row3 + in_row5;

1927

VEC_DATA_TYPE(DATA_TYPE, 8)

1928

comm_fact2 = (DATA_TYPE)9.0f * in_row2 - (DATA_TYPE)10.0f * in_row4 + in_row6;

1929

VEC_DATA_TYPE(DATA_TYPE, 8)

1930

comm_fact3 = (DATA_TYPE)18.0f * in_row1 - (DATA_TYPE)20.0f * in_row3 + (DATA_TYPE)2.0f * in_row5;

1931

VEC_DATA_TYPE(DATA_TYPE, 8)

1932

comm_fact4 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;

1933

VEC_DATA_TYPE(DATA_TYPE, 8)

1934

comm_fact5 = (DATA_TYPE)12.0f * in_row1 - (DATA_TYPE)15.0f * in_row3 + (DATA_TYPE)3.0f * in_row5;

1935

1936

// Calculate intermediate tensors

1937

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = -(DATA_TYPE)36.0f * in_row0 + (DATA_TYPE)49.0f * in_row2 - (DATA_TYPE)14.0f * in_row4 + in_row6;

1938

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 - comm_fact1;

1939

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 + comm_fact1;

1940

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact2 - comm_fact3;

1941

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 + comm_fact3;

1942

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact4 - comm_fact5;

1943

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact4 + comm_fact5;

1944

const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = -(DATA_TYPE)36.0f * in_row1 + (DATA_TYPE)49.0f * in_row3 - (DATA_TYPE)14.0f * in_row5 + in_row7;

1945

1946

VEC_DATA_TYPE(DATA_TYPE, 8)

1947

out0, out1, out2, out3, out4, out5, out6, out7;

1948

1949

OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);

1950

OUTPUT_ROW_2x2_7x7(out1, tmp1, comm_fact0);

1951

OUTPUT_ROW_2x2_7x7(out2, tmp2, comm_fact0);

1952

OUTPUT_ROW_2x2_7x7(out3, tmp3, comm_fact0);

1953

OUTPUT_ROW_2x2_7x7(out4, tmp4, comm_fact0);

1954

OUTPUT_ROW_2x2_7x7(out5, tmp5, comm_fact0);

1955

OUTPUT_ROW_2x2_7x7(out6, tmp6, comm_fact0);

1956

OUTPUT_ROW_2x2_7x7(out7, tmp7, comm_fact0);

1957

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1958

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1959

1960

// Store values across the channels

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1961

#if defined(NUM_TILES_Y)

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

1962

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

1963

#else /* NUM_TILES_Y */

1964

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;

1965

#endif /* NUM_TILES_Y */

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1966

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1967

*((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;

1968

*((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;

1969

*((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;

1970

*((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;

1971

*((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;

1972

*((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;

1973

*((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;

1974

*((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1975

1976

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

1977

*((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;

1978

*((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;

1979

*((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;

1980

*((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;

1981

*((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;

1982

*((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;

1983

*((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;

1984

*((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;

1985

*((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;

1986

*((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;

1987

*((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;

1988

*((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;

1989

*((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;

1990

*((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;

1991

*((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;

1992

*((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;

1993

*((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;

1994

*((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;

1995

*((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;

1996

*((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;

1997

*((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;

1998

*((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;

1999

*((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;

2000

*((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;

2001

*((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;

2002

*((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;

2003

*((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;

2004

*((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;

2005

*((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;

2006

*((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;

2007

*((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;

2008

*((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;

2009

*((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;

2010

*((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;

2011

*((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;

2012

*((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;

2013

*((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;

2014

*((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;

2015

*((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;

2016

*((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;

2017

*((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;

2018

*((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;

2019

*((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;

2020

*((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;

2021

*((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;

2022

*((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;

2023

*((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;

2024

*((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;

2025

*((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;

2026

*((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;

2027

*((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;

2028

*((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;

2029

*((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;

2030

*((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;

2031

*((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;

2032

*((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2033

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

2034

}

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

2035

#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2036

2037

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

2038

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1

2039

*

2040

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2041

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2042

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

2043

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

2044

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2045

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2046

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2047

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2048

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2049

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2050

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2051

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2052

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2053

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2054

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

2055

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2056

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2057

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2058

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2059

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2060

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2061

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

2062

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2063

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2064

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2065

*/

2066

__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(

2067

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2068

TENSOR3D_DECLARATION(dst),

2069

uint src_stride_w,

2070

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2071

{

2072

winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2087

dst_offset_first_element_in_bytes,

2088

src_stride_w,

2089

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2090

}

2091

2092

/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2

2093

*

2094

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2095

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2096

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

2097

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

2098

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2099

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2100

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2101

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2102

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2103

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2104

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2105

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2106

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2107

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2108

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

2109

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2110

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2111

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2112

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2113

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2114

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2115

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

2116

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2117

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2118

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2119

*/

2120

__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(

2121

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2122

TENSOR3D_DECLARATION(dst),

2123

uint src_stride_w,

2124

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2125

{

2126

winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2141

dst_offset_first_element_in_bytes,

2142

src_stride_w,

2143

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2144

}

2145

2146

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1

2147

*

2148

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2149

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2150

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

2151

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

2152

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2153

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2154

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2155

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2156

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2157

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2158

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2159

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2160

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2161

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2162

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2163

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2164

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2165

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2166

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2167

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2168

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2169

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2170

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2171

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2172

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2173

*/

2174

__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(

2175

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2176

TENSOR3D_DECLARATION(dst),

2177

uint src_stride_w,

2178

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2179

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 4x4/3x3 NCHW kernel.
// NOTE(review): the 4x1/3x1 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL - confirm against the callee's definition.

2180

winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2195

dst_offset_first_element_in_bytes,

2196

src_stride_w,

2197

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2198

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2199

2200

/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW

2201

*

2202

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2203

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2204

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

2205

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

2206

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2207

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2208

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2209

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2210

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2211

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2212

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2213

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2214

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2215

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2216

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2217

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2218

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2219

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2220

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2221

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2222

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2223

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2224

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2225

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2226

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2227

*/

2228

__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(

2229

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2230

TENSOR3D_DECLARATION(dst),

2231

uint src_stride_w,

2232

uint dst_stride_w)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2233

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 4x4/5x5 NCHW kernel.
// NOTE(review): the 4x1/5x1 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL - confirm against the callee's definition.

2234

winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2249

dst_offset_first_element_in_bytes,

2250

src_stride_w,

2251

dst_stride_w);

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2252

}

2253

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

2254

#if defined(SRC_DIM_1) && defined(SRC_DIM_2)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2255

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC

2256

*

2257

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2258

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

2259

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

2260

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2261

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

2262

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

2263

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2264

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2265

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2266

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2267

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2268

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2269

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2270

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2271

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2272

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2273

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2274

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2275

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2276

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2277

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2278

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2279

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2280

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2281

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2282

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2283

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2284

*/

2285

__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(

2286

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2287

TENSOR3D_DECLARATION(dst),

2288

uint src_stride_w,

2289

uint dst_stride_w)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2290

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 4x4/3x3 NHWC kernel.
// NOTE(review): the 4x1/3x1 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL - confirm against the callee's definition.

2291

winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2306

dst_offset_first_element_in_bytes,

2307

src_stride_w,

2308

dst_stride_w);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2309

}

2310

2311

/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC

2312

*

2313

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2314

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

2315

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

2316

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2317

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

2318

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

2319

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2320

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2321

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2322

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2323

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2324

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2325

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2326

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2327

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2328

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2329

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2330

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2331

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2332

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2333

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2334

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2335

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2336

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2337

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2338

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2339

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2340

*/

2341

__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(

2342

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2343

TENSOR3D_DECLARATION(dst),

2344

uint src_stride_w,

2345

uint dst_stride_w)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2346

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 4x4/5x5 NHWC kernel.
// NOTE(review): the 4x1/5x1 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL - confirm against the callee's definition.

2347

winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2362

dst_offset_first_element_in_bytes,

2363

src_stride_w,

2364

dst_stride_w);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2365

}

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

2366

2367

/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC

2368

*

2369

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).

2370

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

2371

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

2372

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2373

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

2374

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

2375

* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

2376

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

2377

*

2378

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

2379

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2380

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2381

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2382

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2383

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2384

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2385

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2386

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2387

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2388

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2389

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2390

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2391

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2392

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2393

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2394

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2395

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

2396

*/

2397

__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(

2398

TENSOR3D_DECLARATION(src),

2399

TENSOR3D_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 2x2/7x7 NHWC kernel.
// NOTE(review): the 2x1/7x1 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL - confirm against the callee's definition.

winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

2419

src_stride_w,

2420

dst_stride_w);

2421

}

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2422

#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2423

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

2424

2425

#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

2426

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2

2427

*

2428

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2429

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2430

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2431

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

2432

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2433

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2434

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2435

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2436

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2437

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2438

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2439

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2440

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2441

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2442

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2443

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2444

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2445

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2446

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2447

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2448

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2449

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2450

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2451

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2452

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2453

*/

2454

__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(

2455

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2456

TENSOR3D_DECLARATION(dst),

2457

uint src_stride_w,

2458

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2459

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 2x2/3x3 NCHW kernel.
// NOTE(review): the 1x2/1x3 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_VERTICAL - confirm against the callee's definition.

2460

winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2475

dst_offset_first_element_in_bytes,

2476

src_stride_w,

2477

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2478

}

2479

2480

/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2

2481

*

2482

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2483

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2484

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2485

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

2486

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2487

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2488

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2489

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2490

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2491

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2492

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2493

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2494

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2495

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2496

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2497

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2498

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2499

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2500

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2501

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2502

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2503

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2504

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2505

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2506

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2507

*/

2508

__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(

2509

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2510

TENSOR3D_DECLARATION(dst),

2511

uint src_stride_w,

2512

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2513

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 2x2/3x3 stepz2 NCHW kernel.
// NOTE(review): the 1x2/1x3 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_VERTICAL - confirm against the callee's definition.

2514

winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2529

dst_offset_first_element_in_bytes,

2530

src_stride_w,

2531

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2532

}

2533

2534

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4

2535

*

2536

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2537

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2538

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2539

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2540

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2541

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2542

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2543

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2544

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2545

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2546

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2547

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2548

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2549

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2550

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2551

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2552

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2553

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2554

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2555

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2556

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2557

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2558

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2559

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2560

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2561

*/

2562

__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(

2563

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2564

TENSOR3D_DECLARATION(dst),

2565

uint src_stride_w,

2566

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2567

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 4x4/3x3 NCHW kernel.
// NOTE(review): the 1x4/1x3 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_VERTICAL - confirm against the callee's definition.

2568

winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2583

dst_offset_first_element_in_bytes,

2584

src_stride_w,

2585

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2586

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2587

2588

/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4

2589

*

2590

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2591

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2592

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2593

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2594

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2595

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2596

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2597

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2598

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2599

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2600

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2601

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2602

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2603

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2604

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2605

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2606

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2607

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2608

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2609

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2610

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2611

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2612

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2613

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2614

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2615

*/

2616

__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(

2617

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2618

TENSOR3D_DECLARATION(dst),

2619

uint src_stride_w,

2620

uint dst_stride_w)

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2621

{
// Thin wrapper: forwards the src/dst tensor arguments and both W strides, unchanged, to the 4x4/5x5 NCHW kernel.
// NOTE(review): the 1x4/1x5 behaviour presumably comes from the callee being built with
// -DWINOGRAD_INPUT_TRANSFORM_VERTICAL - confirm against the callee's definition.

2622

winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2637

dst_offset_first_element_in_bytes,

2638

src_stride_w,

2639

dst_stride_w);

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

2640

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2641

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

2642

#if defined(SRC_DIM_1) && defined(SRC_DIM_2)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2643

/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2644

*

2645

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2646

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

2647

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2648

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2649

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2650

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2651

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2652

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2653

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2654

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2655

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2656

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2657

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2658

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2659

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2660

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2661

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2662

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2663

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2664

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2665

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2666

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2667

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2668

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2669

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2670

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2671

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2672

*/

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2673

__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2674

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2675

TENSOR3D_DECLARATION(dst),

2676

uint src_stride_w,

2677

uint dst_stride_w)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2678

{

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2679

// Thin wrapper: every kernel argument is passed through unchanged to winograd_input_transform_4x4_3x3_stepz1_nhwc.
// The src_* / dst_* names used below are presumably the parameters expanded from TENSOR3D_DECLARATION(src/dst) - TODO confirm.
// NOTE(review): the callee presumably yields the 1D (vertical) transform when WINOGRAD_INPUT_TRANSFORM_VERTICAL
// is defined, as required by the note in the header comment - confirm against the callee.
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2694

dst_offset_first_element_in_bytes,

2695

src_stride_w,

2696

dst_stride_w);

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2697

}

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2698

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2699

/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC

2700

*

2701

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).

2702

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

2703

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

2704

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2705

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2706

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

2707

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2708

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2709

*

Vidhya Sudhan Loganathan

2018-08-31 16:10:16 +0100

[diff] [blame]

2710

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2711

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2712

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2713

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2714

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2715

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2716

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2717

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2718

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2719

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2720

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2721

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2722

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2723

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2724

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2725

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2726

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2727

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2728

*/

2729

__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(

2730

TENSOR3D_DECLARATION(src),

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2731

TENSOR3D_DECLARATION(dst),

2732

uint src_stride_w,

2733

uint dst_stride_w)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2734

{

2735

// Thin wrapper: every kernel argument is passed through unchanged to winograd_input_transform_4x4_5x5_stepz1_nhwc.
// The src_* / dst_* names used below are presumably the parameters expanded from TENSOR3D_DECLARATION(src/dst) - TODO confirm.
// NOTE(review): the callee presumably yields the 1D (vertical) transform when WINOGRAD_INPUT_TRANSFORM_VERTICAL
// is defined, as required by the note in the header comment - confirm against the callee.
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

Georgios Pinitas

2018-10-23 15:23:23 +0100

[diff] [blame]

2750

dst_offset_first_element_in_bytes,

2751

src_stride_w,

2752

dst_stride_w);

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

2753

}

Michele Di Giorgio

2019-02-27 14:26:51 +0000

[diff] [blame^]

2754

2755

/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC

2756

*

2757

* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).

2758

* @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)

2759

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

2760

* @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).

2761

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

2762

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

2763

* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time

2764

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.

2765

*

2766

* @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16

2767

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

2768

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2769

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

2770

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2771

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

2772

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2773

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2774

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr

2775

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2776

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2777

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2778

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2779

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2780

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

2781

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2782

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

2783

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

2784

*/

2785

__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(

2786

TENSOR3D_DECLARATION(src),

2787

TENSOR3D_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

// Thin wrapper: every kernel argument is passed through unchanged to winograd_input_transform_2x2_7x7_stepz1_nhwc.
// The src_* / dst_* names used below are presumably the parameters expanded from TENSOR3D_DECLARATION(src/dst) - TODO confirm.
// NOTE(review): the callee presumably yields the 1D (vertical) transform when WINOGRAD_INPUT_TRANSFORM_VERTICAL
// is defined, as required by the note in the header comment - confirm against the callee.
winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

2807

src_stride_w,

2808

dst_stride_w);

2809

}

Georgios Pinitas

2018-10-29 18:01:52 +0000

[diff] [blame]

2810

#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

2811

#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

Michele Di Giorgio