Blame - src/core/CL/cl_kernels/im2col.cl - ml/ComputeLibrary

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 1x1 and the stride_x = 1

45

*

46

* @note This kernel computes 4 elements

47

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

48

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

49

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

50

* @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1

51

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

52

*

53

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32

54

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

55

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

56

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

57

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

58

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

59

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

60

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

61

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

62

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

63

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

64

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

65

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

66

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

67

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

68

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

69

*/

70

__kernel void im2col1x1_stridex1_dchw(

71

TENSOR3D_DECLARATION(src),

72

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor

77

const uint yc = get_global_id(1); // y coordinate in the convolved tensor

78

const uint ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

79

const uint batch = get_global_id(2) / KERNEL_DEPTH; // batch size

80

81

// Clamp xc

82

// The strategy clamps at "xc" as it will be a valid value for sure

83

uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);

84

85

// Check which values are valid

86

const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

87

88

xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));

89

90

// Calculate input indices

91

const uint xi = xc;

92

const uint yi = yc * STRIDE_Y;

93

94

// Calculate output indices

95

const uint xo = ch;

96

const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution

97

98

// Get input and output address

99

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;

100

101

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;

102

103

VEC_DATA_TYPE(DATA_TYPE, 4)

104

data = vload4(0, (__global DATA_TYPE *)input_ptr);

105

106

// If out-of-bound, overwrite with the first element

107

data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);

108

109

*(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;

110

*(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;

111

*(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;

112

*(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;

113

114

#ifdef HAS_BIAS

115

if(ch == (KERNEL_DEPTH - 1))

116

{

117

*((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;

118

*((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;

119

*((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;

120

*((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;

}

#endif // HAS_BIAS

}

#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)

125

Pablo Tello

4a626a7

2018-04-04 10:01:14 +0100

[diff] [blame^]

126

/** Read one element of type DATA_TYPE stored at the byte address PTR.
 *
 * Every pointer handed to this macro in this file is derived from a `__global uchar *`,
 * so the cast keeps the `__global` address space instead of silently dropping it
 * (OpenCL C does not allow casting a pointer from one address space to another).
 * The expansion is fully parenthesised so it can be used inside larger expressions.
 *
 * @param[in] PTR       Byte pointer into global memory
 * @param[in] DATA_TYPE Element type to read
 */
#define PTR_TO_VALUE(PTR, DATA_TYPE) (*((__global DATA_TYPE *)(PTR)))

127

Gian Marco

76faef8

2018-01-29 12:15:32 +0000

[diff] [blame]

128

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

Pablo Tello

4a626a7

2018-04-04 10:01:14 +0100

[diff] [blame^]

129

130

/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM for a generic kernel size (KERNEL_WIDTH x KERNEL_HEIGHT)

131

*

132

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

133

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

134

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

135

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

136

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

137

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

138

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

139

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

140

*

141

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32

142

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

143

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

144

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

145

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

146

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

147

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

148

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

149

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

150

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

151

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

152

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

153

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

154

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

155

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

156

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

157

*/

158

__kernel void im2col_generic_nhwc(

159

TENSOR3D_DECLARATION(src),

160

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int src_stride_y_int = (int)src_stride_y;

165

const int src_stride_z_int = (int)src_stride_z;

166

const int xc = get_global_id(1); // x coordinate in the convolved tensor

167

const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor

168

const int ch = get_global_id(0); // input feature map

169

const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size

170

171

// Calculate input indices

172

const int xi = xc * STRIDE_X - PAD_LEFT;

173

const int yi = yc * STRIDE_Y - PAD_TOP;

174

175

// Calculate output indices

176

const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH;

177

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

178

179

// Get input and output address

180

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;

181

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

182

183

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

184

{

185

const int y0 = yi + yk;

186

if(y0 >= 0 && y0 < SRC_HEIGHT)

187

{

188

int xk;

189

for(xk = 0; xk < KERNEL_WIDTH; xk++)

190

{

191

const int x0 = xi + xk;

192

if(x0 >= 0 && x0 < SRC_WIDTH)

193

{

194

*((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + xk * src_stride_y + yk * src_stride_z, DATA_TYPE);

}

else

{

*((__global DATA_TYPE *)output_ptr) = PAD_VALUE;

199

}

200

output_ptr += 1 * sizeof(DATA_TYPE);

}

}

else

{

for(int xk = 0; xk < KERNEL_WIDTH; xk++)

206

{

207

*((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE;

208

output_ptr += 1 * dst_stride_x;

}

}

}

#ifdef HAS_BIAS

if(ch == (KERNEL_DEPTH - 1))

214

{

215

*((__global DATA_TYPE *)output_ptr) = 1.0f;

216

output_ptr += 1 * dst_stride_x;

}

#endif // HAS_BIAS

}

/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3

222

*

223

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

224

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

225

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

226

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

227

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

228

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

229

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

230

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

231

*

232

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32

233

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

234

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

235

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

236

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

237

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

238

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

239

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

240

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

241

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

242

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

243

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

244

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

245

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

246

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

247

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

248

*/

249

__kernel void im2col3x3_nhwc(

250

TENSOR3D_DECLARATION(src),

251

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int src_stride_y_int = (int)src_stride_y;

256

const int src_stride_z_int = (int)src_stride_z;

257

const int xc = get_global_id(1); // x coordinate in the convolved tensor

258

const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor

259

const int ch = get_global_id(0); // input feature map

260

const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size

261

262

// Calculate input indices

263

const int xi = xc * STRIDE_X - PAD_LEFT;

264

const int yi = yc * STRIDE_Y - PAD_TOP;

265

266

// Calculate output indices

267

const int xo = ch * 9; // 3x3

268

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

269

270

// Get input and output address

271

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;

272

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

273

274

VEC_DATA_TYPE(DATA_TYPE, 3)

275

row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);

276

VEC_DATA_TYPE(DATA_TYPE, 3)

277

row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);

278

VEC_DATA_TYPE(DATA_TYPE, 3)

279

row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);

280

281

const int3 y = (int3)yi + (int3)(0, 1, 2);

282

// Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer.

283

if(y.s0 >= 0 && y.s0 < SRC_HEIGHT)

284

{

285

row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(

286

PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE),

287

PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE),

288

PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE));

289

}

290

291

if(y.s1 >= 0 && y.s1 < SRC_HEIGHT)

292

{

293

row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(

294

PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE),

295

PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE),

296

PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE));

297

}

298

299

if(y.s2 >= 0 && y.s2 < SRC_HEIGHT)

300

{

301

row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(

302

PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE),

303

PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE),

304

PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE));

305

}

306

307

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

308

// Put 0 if the value is out-of-bound

309

const int3 x = (int3)xi + (int3)(0, 1, 2);

310

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

311

cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

312

row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);

313

row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0);

314

row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0);

315

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

316

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);

317

*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;

318

319

#ifdef HAS_BIAS

320

if(ch == (KERNEL_DEPTH - 1))

321

{

322

*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;

}

#endif // HAS_BIAS

}

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

327

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3

328

*

329

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

330

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

331

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

332

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

333

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

334

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

335

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

336

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

337

*

338

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32

339

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

340

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

341

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

342

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

343

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

344

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

345

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

346

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

347

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

348

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

349

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

350

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

351

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

352

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

353

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

354

*/

355

__kernel void im2col3x3_dchw(

356

TENSOR3D_DECLARATION(src),

357

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int xc = get_global_id(0); // x coordinate in the convolved tensor

362

const int yc = get_global_id(1); // y coordinate in the convolved tensor

363

const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

364

const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

365

366

// Calculate input indices

367

const int xi = xc * STRIDE_X - PAD_LEFT;

368

const int yi = yc * STRIDE_Y - PAD_TOP;

369

370

// Calculate output indices

371

const int xo = ch * 9; // 3x3

372

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

373

374

// Get input and output address

375

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;

376

377

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

378

379

VEC_DATA_TYPE(DATA_TYPE, 3)

380

row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

381

VEC_DATA_TYPE(DATA_TYPE, 3)

382

row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

383

VEC_DATA_TYPE(DATA_TYPE, 3)

384

row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

385

386

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

387

// Put 0 if the value is out-of-bound

388

int3 x = (int3)xi + (int3)(0, 1, 2);

389

int3 y = (int3)yi + (int3)(0, 1, 2);

390

391

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

392

cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

393

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

394

cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

395

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

396

cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

397

398

row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);

399

row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);

400

row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);

401

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

402

403

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);

404

*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;

405

406

#ifdef HAS_BIAS

407

if(ch == (KERNEL_DEPTH - 1))

408

{

409

*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;

}

#endif // HAS_BIAS

}

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5

415

*

416

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

417

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

418

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

419

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

420

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

421

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

422

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

423

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

424

*

425

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32

426

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

427

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

428

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

429

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

430

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

431

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

432

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

433

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

434

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

435

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

436

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

437

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

438

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

439

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

440

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

441

*/

442

__kernel void im2col5x5_dchw(

443

TENSOR3D_DECLARATION(src),

444

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int xc = get_global_id(0); // x coordinate in the convolved tensor

449

const int yc = get_global_id(1); // y coordinate in the convolved tensor

450

const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

451

const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

452

453

// Calculate input indices

454

const int xi = xc * STRIDE_X - PAD_LEFT;

455

const int yi = yc * STRIDE_Y - PAD_TOP;

456

457

// Calculate output indices

458

const int xo = ch * 25; // 5x5

459

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

460

461

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

462

// Put 0 if the value is out-of-bound

463

int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);

464

int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);

int x1 = xi + 4;

int y1 = yi + 4;

// Check if we could have out-of-bounds elements in the x direction

469

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

470

x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

471

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

472

y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

473

COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);

474

COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);

475

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

476

477

// Get input and output address

478

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;

479

480

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

481

482

{

483

VEC_DATA_TYPE(DATA_TYPE, 4)

484

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

485

DATA_TYPE

486

row01 = *((__global DATA_TYPE *)input_ptr + 4);

487

488

input_ptr += src_stride_y;

489

490

VEC_DATA_TYPE(DATA_TYPE, 4)

491

row10 = vload4(0, (__global DATA_TYPE *)input_ptr);

492

DATA_TYPE

493

row11 = *((__global DATA_TYPE *)input_ptr + 4);

494

495

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

496

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

497

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;

498

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

499

cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;

500

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);

501

COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);

502

503

// Replace with 0 if the value is not valid

504

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

505

row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);

506

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

507

row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);

508

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

509

510

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,

511

row10.s012),

512

0, (__global DATA_TYPE *)output_ptr);

513

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

514

515

input_ptr += src_stride_y;

516

output_ptr += 10 * dst_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 4)

521

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

522

DATA_TYPE

523

row01 = *((__global DATA_TYPE *)input_ptr + 4);

524

525

input_ptr += src_stride_y;

526

527

VEC_DATA_TYPE(DATA_TYPE, 4)

528

row10 = vload4(0, (__global DATA_TYPE *)input_ptr);

529

DATA_TYPE

530

row11 = *((__global DATA_TYPE *)input_ptr + 4);

531

532

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

533

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

534

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;

535

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

536

cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;

537

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);

538

COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);

539

540

// Replace with 0 if the value is not valid

541

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

542

row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);

543

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

544

row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);

545

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

546

547

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,

548

row10.s012),

549

0, (__global DATA_TYPE *)output_ptr);

550

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

551

552

input_ptr += src_stride_y;

553

output_ptr += 10 * dst_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 4)

558

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

559

DATA_TYPE

560

row01 = *((__global DATA_TYPE *)input_ptr + 4);

561

562

input_ptr += src_stride_y;

563

564

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

565

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

566

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;

567

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);

568

569

// Replace with 0 if the value is not valid

570

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

571

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

572

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

573

574

vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);

575

*((__global DATA_TYPE *)output_ptr + 4) = row01;

576

577

output_ptr += 5 * dst_stride_x;

}

#ifdef HAS_BIAS

if(ch == (KERNEL_DEPTH - 1))

582

{

583

*((__global DATA_TYPE *)output_ptr) = 1.0f;

}

#endif // HAS_BIAS

}

#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

588

589

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)

590

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 11x11

591

*

592

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

593

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

594

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

595

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

596

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

597

*

598

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32

599

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

600

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

601

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

602

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

603

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

604

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

605

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

606

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

607

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

608

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

609

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

610

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

611

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

612

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

613

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

614

*/

615

__kernel void im2col11x11_padx0_pady0_dchw(

616

TENSOR3D_DECLARATION(src),

617

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int xc = get_global_id(0); // x coordinate in the convolved tensor

622

const int yc = get_global_id(1); // y coordinate in the convolved tensor

623

const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

624

const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

625

626

// Calculate input indices

627

const int xi = xc * STRIDE_X;

628

const int yi = yc * STRIDE_Y;

629

630

// Calculate output indices

631

const int xo = ch * 121; // 11x11

632

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

633

634

// Get input and output address

635

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;

636

637

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

638

{

639

VEC_DATA_TYPE(DATA_TYPE, 8)

640

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

641

VEC_DATA_TYPE(DATA_TYPE, 3)

642

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

643

644

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

645

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

646

647

input_ptr += src_stride_y;

648

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

653

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

654

VEC_DATA_TYPE(DATA_TYPE, 3)

655

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

656

657

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

658

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

659

660

input_ptr += src_stride_y;

661

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

666

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

667

VEC_DATA_TYPE(DATA_TYPE, 3)

668

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

669

670

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

671

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

672

673

input_ptr += src_stride_y;

674

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

679

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

680

VEC_DATA_TYPE(DATA_TYPE, 3)

681

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

682

683

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

684

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

685

686

input_ptr += src_stride_y;

687

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

692

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

693

VEC_DATA_TYPE(DATA_TYPE, 3)

694

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

695

696

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

697

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

698

699

input_ptr += src_stride_y;

700

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

705

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

706

VEC_DATA_TYPE(DATA_TYPE, 3)

707

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

708

709

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

710

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

711

712

input_ptr += src_stride_y;

713

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

718

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

719

VEC_DATA_TYPE(DATA_TYPE, 3)

720

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

721

722

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

723

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

724

725

input_ptr += src_stride_y;

726

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

731

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

732

VEC_DATA_TYPE(DATA_TYPE, 3)

733

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

734

735

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

736

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

737

738

input_ptr += src_stride_y;

739

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

744

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

745

VEC_DATA_TYPE(DATA_TYPE, 3)

746

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

747

748

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

749

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

750

751

input_ptr += src_stride_y;

752

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

757

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

758

VEC_DATA_TYPE(DATA_TYPE, 3)

759

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

760

761

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

762

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

763

764

input_ptr += src_stride_y;

765

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

770

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

771

VEC_DATA_TYPE(DATA_TYPE, 3)

772

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

773

774

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

775

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

776

777

output_ptr += 11 * src_stride_x;

}

#ifdef HAS_BIAS

if(ch == (KERNEL_DEPTH - 1))

782

{

783

*((__global DATA_TYPE *)output_ptr) = 1.0f;

}

#endif // HAS_BIAS

}

#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)

788

#endif // !defined(FIXED_POINT_POSITION)

789

790

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)

791

/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when

792

* the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.

793

*

794

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.

795

* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.

796

* @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.

797

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

798

*

799

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32

800

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

801

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

802

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

803

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

804

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

805

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

806

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

807

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

808

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

809

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

810

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

811

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

812

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

813

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

814

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

815

*/

816

__kernel void im2col_generic_padx0_pady0_dchw(

817

TENSOR3D_DECLARATION(src),

818

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int xc = get_global_id(0); // x coordinate in the convolved tensor

823

const int yc = get_global_id(1); // y coordinate in the convolved tensor

824

const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

825

const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

826

827

// Calculate input indices

828

const int xi = xc * STRIDE_X;

829

const int yi = yc * STRIDE_Y;

830

// Calculate output indices

831

const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;

832

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

833

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;

834

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;

835

// Linearize convolution elements

836

for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)

837

{

838

int last_x = 0;

839

for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)

840

{

841

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

842

row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

843

VSTORE(VECTOR_SIZE)

844

(row, 0, output_ptr);

845

last_x = x;

846

}

847

// Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).

848

// Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit.

849

#if WIDTH_MOD_VECTOR_SIZE == 1

850

*output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));

851

#elif WIDTH_MOD_VECTOR_SIZE > 1

852

VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)

853

row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));

854

VSTORE(WIDTH_MOD_VECTOR_SIZE)

855

(row, 0, output_ptr);

856

#endif /* WIDTH_MOD_VECTOR_SIZE */

857

output_ptr += WIDTH_MOD_VECTOR_SIZE;

858

} /* End of loop over KERNEL_HEIGHT */

859

860

#ifdef HAS_BIAS

861

if(ch == (KERNEL_DEPTH - 1))

862

{

863

#ifdef FIXED_POINT_POSITION

864

*output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);

865

#else // FIXED_POINT_POSITION

866

*output_ptr = 1.0f;

867

#endif // FIXED_POINT_POSITION

}

#endif // HAS_BIAS

}

#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)

872

873

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

874

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.

875

*

876

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

877

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

878

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

879

* @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DKERNEL_DEPTH=64

880

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

881

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

882

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

Alex Gilday

7da29b6

2018-03-23 14:16:00 +0000

[diff] [blame]

883

* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1

Gian Marco

76faef8

2018-01-29 12:15:32 +0000

[diff] [blame]

884

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

885

*

886

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32

887

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

888

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

889

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

890

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

891

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

892

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

893

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

894

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

895

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

896

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

897

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

898

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

899

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

900

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

901

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

902

*/

903

__kernel void im2col_generic_dchw(

904

TENSOR3D_DECLARATION(src),

905

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int xc = get_global_id(0); // x coordinate in the convolved tensor

910

const int yc = get_global_id(1); // y coordinate in the convolved tensor

911

const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

912

const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

913

914

// Calculate input indices

915

const int xi = xc * STRIDE_X - PAD_LEFT;

916

const int yi = yc * STRIDE_Y - PAD_TOP;

917

918

// Calculate output indices

919

const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;

920

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

921

922

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;

923

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;

924

925

// Linearize convolution elements

Alex Gilday

7da29b6

2018-03-23 14:16:00 +0000

[diff] [blame]

926

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

Gian Marco

76faef8

2018-01-29 12:15:32 +0000

[diff] [blame]

927

{

Alex Gilday

7da29b6

2018-03-23 14:16:00 +0000

[diff] [blame]

928

int y = yi + yk * DILATION_Y;

929

for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)

Gian Marco

76faef8

2018-01-29 12:15:32 +0000

[diff] [blame]

930

{

Alex Gilday

7da29b6

2018-03-23 14:16:00 +0000

[diff] [blame]

931

int x = xi + xk * DILATION_X;

Gian Marco

76faef8

2018-01-29 12:15:32 +0000

[diff] [blame]

932

#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

933

*output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

934

#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

935

if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)

936

{

937

*output_ptr = PAD_VALUE;

}

else

{

*output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

942

}

943

#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

}

}

#ifdef HAS_BIAS

if(ch == (KERNEL_DEPTH - 1))

949

{

950

#ifdef FIXED_POINT_POSITION

951

*output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);

952

#else // FIXED_POINT_POSITION

953

*output_ptr = 1.0f;

954

#endif // FIXED_POINT_POSITION

}

#endif // HAS_BIAS

}

#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

959

960

/**This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when

961

* the kernel width and height are the same as the width and height of the input tensor

962

*

963

* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float

964

* @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

965

*

966

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32

967

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

968

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

969

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

970

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

971

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

972

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

973

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

974

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

975

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

976

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

977

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

978

* @param[in] width The width of the input tensor

979

* @param[in] height The height of the input tensor

980

*/

981

__kernel void im2col_reduced_dchw(

982

TENSOR3D_DECLARATION(src),

983

VECTOR_DECLARATION(dst),

984

uint width, uint height)

985

{

986

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

987

988

const uint image_size = width * height;

989

990

__global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x;

991

992