Blame - src/core/CL/cl_kernels/im2col.cl - ml/ComputeLibrary

2018-07-04 09:34:00 +0100

[diff] [blame]

48

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

49

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

50

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

51

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

52

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

53

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

54

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

55

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

56

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

57

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

58

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

59

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

60

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

61

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

62

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

63

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

64

*/

65

__kernel void im2col1x1_stridex1_dchw(

66

TENSOR3D_DECLARATION(src),

67

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor

72

const uint yc = get_global_id(1); // y coordinate in the convolved tensor

73

const uint ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

74

const uint batch = get_global_id(2) / KERNEL_DEPTH; // batch size

75

76

// Clamp xc

77

// The strategy clamps at "xc" as it will be a valid value for sure

78

uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);

79

80

// Check which values are valid

81

const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

82

83

xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));

84

85

// Calculate input indices

86

const uint xi = xc;

87

const uint yi = yc * STRIDE_Y;

88

89

// Calculate output indices

90

const uint xo = ch;

91

const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution

92

93

// Get input and output address

94

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;

95

96

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;

97

98

VEC_DATA_TYPE(DATA_TYPE, 4)

99

data = vload4(0, (__global DATA_TYPE *)input_ptr);

100

101

// If out-of-bound, overwrite with the first element

102

data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);

103

104

*(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;

105

*(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;

106

*(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;

107

*(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;

108

109

#ifdef HAS_BIAS

110

if(ch == (KERNEL_DEPTH - 1))

111

{

112

*((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;

113

*((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;

114

*((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;

115

*((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;

}

#endif // HAS_BIAS

}

#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)

120

Michele Di Giorgio

14bd2cf

2018-06-14 14:55:14 +0100

[diff] [blame]

121

// Dereference PTR as a __global pointer to DATA_TYPE. The whole expansion is parenthesized so that
// operators applied to the macro result bind to the dereferenced value and not to the cast.
#define PTR_TO_VALUE(PTR, DATA_TYPE) (*((__global DATA_TYPE *)(PTR)))

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

122

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

123

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

124

125

/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM for a generic kernel size (KERNEL_WIDTH x KERNEL_HEIGHT)

126

*

127

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

128

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

129

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

130

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

131

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

132

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

133

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

Georgios Pinitas

2018-06-19 13:09:53 +0100

[diff] [blame]

134

* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

135

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

136

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

137

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

138

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

139

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

140

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

141

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

142

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

143

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

144

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

145

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

146

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

147

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

148

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

149

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

150

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

151

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

152

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

153

*/

154

__kernel void im2col_generic_nhwc(

155

TENSOR3D_DECLARATION(src),

156

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int src_stride_y_int = (int)src_stride_y;

161

const int src_stride_z_int = (int)src_stride_z;

162

const int xc = get_global_id(1); // x coordinate in the convolved tensor

163

const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor

164

const int ch = get_global_id(0); // input feature map

165

const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size

166

167

// Calculate input indices

168

const int xi = xc * STRIDE_X - PAD_LEFT;

169

const int yi = yc * STRIDE_Y - PAD_TOP;

170

171

// Calculate output indices

172

const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH;

173

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

174

175

// Get input and output address

176

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;

177

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

178

179

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

180

{

Georgios Pinitas

2018-06-19 13:09:53 +0100

[diff] [blame]

181

const int dilated_offset_y = yk * DILATION_Y;

182

const int y0 = yi + dilated_offset_y;

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

183

if(y0 >= 0 && y0 < SRC_HEIGHT)

184

{

185

int xk;

186

for(xk = 0; xk < KERNEL_WIDTH; xk++)

187

{

Georgios Pinitas

2018-06-19 13:09:53 +0100

[diff] [blame]

188

const int dilated_offset_x = xk * DILATION_X;

189

const int x0 = xi + dilated_offset_x;

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

190

if(x0 >= 0 && x0 < SRC_WIDTH)

191

{

Georgios Pinitas

2018-06-19 13:09:53 +0100

[diff] [blame]

192

*((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + dilated_offset_x * src_stride_y + dilated_offset_y * src_stride_z, DATA_TYPE);

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

}

else

{

*((__global DATA_TYPE *)output_ptr) = PAD_VALUE;

197

}

198

output_ptr += 1 * sizeof(DATA_TYPE);

}

}

else

{

for(int xk = 0; xk < KERNEL_WIDTH; xk++)

204

{

205

*((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE;

206

output_ptr += 1 * dst_stride_x;

}

}

}

#ifdef HAS_BIAS

if(ch == (KERNEL_DEPTH - 1))

212

{

213

*((__global DATA_TYPE *)output_ptr) = 1.0f;

214

output_ptr += 1 * dst_stride_x;

}

#endif // HAS_BIAS

}

/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3

220

*

221

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

222

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

223

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

224

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

225

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

226

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

227

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

228

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

229

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

230

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

231

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

232

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

233

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

234

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

235

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

236

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

237

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

238

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

239

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

240

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

241

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

242

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

243

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

244

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

245

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

246

*/

247

__kernel void im2col3x3_nhwc(

248

TENSOR3D_DECLARATION(src),

249

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int src_stride_y_int = (int)src_stride_y;

254

const int src_stride_z_int = (int)src_stride_z;

255

const int xc = get_global_id(1); // x coordinate in the convolved tensor

256

const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor

257

const int ch = get_global_id(0); // input feature map

258

const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size

259

260

// Calculate input indices

261

const int xi = xc * STRIDE_X - PAD_LEFT;

262

const int yi = yc * STRIDE_Y - PAD_TOP;

263

264

// Calculate output indices

265

const int xo = ch * 9; // 3x3

266

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

267

268

// Get input and output address

269

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;

270

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

271

272

VEC_DATA_TYPE(DATA_TYPE, 3)

273

row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);

274

VEC_DATA_TYPE(DATA_TYPE, 3)

275

row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);

276

VEC_DATA_TYPE(DATA_TYPE, 3)

277

row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);

278

279

const int3 y = (int3)yi + (int3)(0, 1, 2);

280

// Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer.

281

if(y.s0 >= 0 && y.s0 < SRC_HEIGHT)

282

{

283

row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(

284

PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE),

285

PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE),

286

PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE));

287

}

288

289

if(y.s1 >= 0 && y.s1 < SRC_HEIGHT)

290

{

291

row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(

292

PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE),

293

PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE),

294

PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE));

295

}

296

297

if(y.s2 >= 0 && y.s2 < SRC_HEIGHT)

298

{

299

row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(

300

PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE),

301

PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE),

302

PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE));

303

}

304

305

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

306

// Put 0 if the value is out-of-bound

307

const int3 x = (int3)xi + (int3)(0, 1, 2);

308

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

309

cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

310

row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);

311

row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0);

312

row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0);

313

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

314

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);

315

*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;

316

317

#ifdef HAS_BIAS

318

if(ch == (KERNEL_DEPTH - 1))

319

{

320

*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;

}

#endif // HAS_BIAS

}

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

325

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3

326

*

327

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

328

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

329

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

330

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

331

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

332

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

333

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

334

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

335

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

336

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

337

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

338

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

339

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

340

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

341

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

342

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

343

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

344

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

345

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

346

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

347

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

348

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

349

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

350

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

351

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

352

*/

353

__kernel void im2col3x3_dchw(

354

TENSOR3D_DECLARATION(src),

355

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int xc = get_global_id(0); // x coordinate in the convolved tensor

360

const int yc = get_global_id(1); // y coordinate in the convolved tensor

361

const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

362

const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

363

364

// Calculate input indices

365

const int xi = xc * STRIDE_X - PAD_LEFT;

366

const int yi = yc * STRIDE_Y - PAD_TOP;

367

368

// Calculate output indices

369

const int xo = ch * 9; // 3x3

370

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

371

372

// Get input and output address

373

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;

374

375

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

376

377

VEC_DATA_TYPE(DATA_TYPE, 3)

378

row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

379

VEC_DATA_TYPE(DATA_TYPE, 3)

380

row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

381

VEC_DATA_TYPE(DATA_TYPE, 3)

382

row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

383

384

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

385

// Put 0 if the value is out-of-bound

386

int3 x = (int3)xi + (int3)(0, 1, 2);

387

int3 y = (int3)yi + (int3)(0, 1, 2);

388

389

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

390

cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

391

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

392

cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

393

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

394

cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

395

396

row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);

397

row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);

398

row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);

399

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

400

401

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);

402

*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;

403

404

#ifdef HAS_BIAS

405

if(ch == (KERNEL_DEPTH - 1))

406

{

407

*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;

}

#endif // HAS_BIAS

}

/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5

413

*

414

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

415

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

416

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

417

* @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3

418

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

419

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

420

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

421

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

422

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

423

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

424

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

425

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

426

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

427

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

428

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

429

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

430

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

431

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

432

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

433

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

434

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

435

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

436

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

437

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

438

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

439

*/

440

__kernel void im2col5x5_dchw(

441

TENSOR3D_DECLARATION(src),

442

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int xc = get_global_id(0); // x coordinate in the convolved tensor

447

const int yc = get_global_id(1); // y coordinate in the convolved tensor

448

const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map

449

const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

450

451

// Calculate input indices

452

const int xi = xc * STRIDE_X - PAD_LEFT;

453

const int yi = yc * STRIDE_Y - PAD_TOP;

454

455

// Calculate output indices

456

const int xo = ch * 25; // 5x5

457

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

458

459

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

460

// Put 0 if the value is out-of-bound

461

int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);

462

int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);

int x1 = xi + 4;

int y1 = yi + 4;

// Check if we could have out-of-bounds elements in the x direction

467

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

468

x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

469

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

470

y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

471

COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);

472

COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);

473

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

474

475

// Get input and output address

476

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;

477

478

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

479

480

{

481

VEC_DATA_TYPE(DATA_TYPE, 4)

482

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

483

DATA_TYPE

484

row01 = *((__global DATA_TYPE *)input_ptr + 4);

485

486

input_ptr += src_stride_y;

487

488

VEC_DATA_TYPE(DATA_TYPE, 4)

489

row10 = vload4(0, (__global DATA_TYPE *)input_ptr);

490

DATA_TYPE

491

row11 = *((__global DATA_TYPE *)input_ptr + 4);

492

493

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

494

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

495

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;

496

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

497

cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;

498

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);

499

COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);

500

501

// Replace with 0 if the value is not valid

502

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

503

row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);

504

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

505

row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);

506

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

507

508

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,

509

row10.s012),

510

0, (__global DATA_TYPE *)output_ptr);

511

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

512

513

input_ptr += src_stride_y;

514

output_ptr += 10 * dst_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 4)

519

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

520

DATA_TYPE

521

row01 = *((__global DATA_TYPE *)input_ptr + 4);

522

523

input_ptr += src_stride_y;

524

525

VEC_DATA_TYPE(DATA_TYPE, 4)

526

row10 = vload4(0, (__global DATA_TYPE *)input_ptr);

527

DATA_TYPE

528

row11 = *((__global DATA_TYPE *)input_ptr + 4);

529

530

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

531

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

532

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;

533

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

534

cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;

535

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);

536

COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);

537

538

// Replace with 0 if the value is not valid

539

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

540

row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);

541

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

542

row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);

543

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

544

545

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,

546

row10.s012),

547

0, (__global DATA_TYPE *)output_ptr);

548

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

549

550

input_ptr += src_stride_y;

551

output_ptr += 10 * dst_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 4)

556

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

557

DATA_TYPE

558

row01 = *((__global DATA_TYPE *)input_ptr + 4);

559

560

input_ptr += src_stride_y;

561

562

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

563

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

564

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;

565

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);

566

567

// Replace with 0 if the value is not valid

568

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

569

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

570

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

571

572

vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);

573

*((__global DATA_TYPE *)output_ptr + 4) = row01;

574

575

output_ptr += 5 * dst_stride_x;

}

#ifdef HAS_BIAS

if(ch == (KERNEL_DEPTH - 1))

580

{

581

*((__global DATA_TYPE *)output_ptr) = 1.0f;

}

#endif // HAS_BIAS

}

#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

586

587

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 11x11
 *
 * Each work-item copies one 11x11 patch of a single input channel into 121 consecutive elements of one destination row.
 * The kernel performs no bounds checks and writes no padding values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col11x11_padx0_pady0_dchw(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

    // Calculate input indices
    const int xi = xc * STRIDE_X;
    const int yi = yc * STRIDE_Y;

    // Calculate output indices
    const int xo = ch * 121;                  // 11x11 elements per input channel
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

    // Get input and output address
    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

    // Copy the 11 rows of the patch. Each row is moved as an 8-wide plus a 3-wide vector.
    // The trip count is a compile-time constant, so the compiler is free to unroll the loop.
    for(int row = 0; row < 11; ++row)
    {
        __global DATA_TYPE *src_row = (__global DATA_TYPE *)(input_ptr + row * src_stride_y);
        // The destination is advanced with the destination stride (not the source one), matching how xo is scaled above
        __global DATA_TYPE *dst_row = (__global DATA_TYPE *)(output_ptr + row * 11 * dst_stride_x);

        VEC_DATA_TYPE(DATA_TYPE, 8)
        row0 = vload8(0, src_row);
        VEC_DATA_TYPE(DATA_TYPE, 3)
        row1 = vload3(0, src_row + 8);

        vstore8(row0, 0, dst_row);
        vstore3(row1, 0, dst_row + 8);
    }

#ifdef HAS_BIAS
    // Only the work-item handling the last channel appends the bias column, right after the 121 copied elements
    if(ch == (KERNEL_DEPTH - 1))
    {
        output_ptr += 121 * dst_stride_x;

        *((__global DATA_TYPE *)output_ptr) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

786

787

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
 *  the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.
 *
 * Each kernel row is copied in chunks of VECTOR_SIZE elements, followed by one chunk of WIDTH_MOD_VECTOR_SIZE elements.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
 * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
 * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col_generic_padx0_pady0_dchw(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size

    // Calculate input indices
    const int xi = xc * STRIDE_X;
    const int yi = yc * STRIDE_Y;

    // Calculate output indices
    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;

    // One past the last source column covered by the kernel window
    const int x_e = xi + KERNEL_WIDTH;

    // Linearize convolution elements
    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
    {
        // x outlives the vector loop so that the remainder copy resumes exactly where the loop stopped.
        // It starts at xi, so the remainder is still read from the right column if the loop body never executes.
        int x = xi;
        for(; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
        {
            VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
            row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
            VSTORE(VECTOR_SIZE)
            (row, 0, output_ptr);
        }

        // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
        // At this point x and output_ptr already point at the first element not handled by the vector loop.
#if WIDTH_MOD_VECTOR_SIZE == 1
        *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
#elif WIDTH_MOD_VECTOR_SIZE > 1
        VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
        row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
        VSTORE(WIDTH_MOD_VECTOR_SIZE)
        (row, 0, output_ptr);
#endif /* WIDTH_MOD_VECTOR_SIZE */
        output_ptr += WIDTH_MOD_VECTOR_SIZE;
    } /* End of loop over KERNEL_HEIGHT */

#ifdef HAS_BIAS
    // Only the work-item handling the last channel appends the bias column
    if(ch == (KERNEL_DEPTH - 1))
    {
        *output_ptr = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)

865

866

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
 *
 * Each work-item walks one KERNEL_WIDTH x KERNEL_HEIGHT window of a single input channel, element by element, and writes
 * the values to consecutive elements of one destination row. Window positions that fall outside the source plane
 * are written as PAD_VALUE.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DKERNEL_DEPTH=64
 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1.
 *       These two macros are used by the kernel body but are not tested by the enclosing preprocessor guard.
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col_generic_dchw(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // Position of this work-item in the convolved tensor
    const int conv_x = get_global_id(0);
    const int conv_y = get_global_id(1);

    // The third dimension packs the input feature map and the batch index together
    const int ch    = get_global_id(2) % KERNEL_DEPTH;
    const int batch = get_global_id(2) / KERNEL_DEPTH;

    // Top-left corner of the window in source coordinates (may be negative because of padding)
    const int window_x0 = conv_x * STRIDE_X - PAD_LEFT;
    const int window_y0 = conv_y * STRIDE_Y - PAD_TOP;

    // Destination row and the first column written for this channel
    const int dst_row = conv_x + conv_y * CONVOLVED_WIDTH;
    const int dst_col = ch * KERNEL_WIDTH * KERNEL_HEIGHT;

    __global uchar *input_ptr     = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
    __global DATA_TYPE *out_elem = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + dst_row * dst_stride_y + batch * dst_stride_w)) + dst_col;

    // Linearize the window row by row, one element per iteration
    for(int ky = 0; ky < KERNEL_HEIGHT; ++ky)
    {
        const int y = window_y0 + ky * DILATION_Y;

        for(int kx = 0; kx < KERNEL_WIDTH; ++kx)
        {
            const int x = window_x0 + kx * DILATION_X;

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
            // The source address is only formed once the coordinates are known to be inside the plane
            if(x >= 0 && x < SRC_WIDTH && y >= 0 && y < SRC_HEIGHT)
            {
                *out_elem = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
            }
            else
            {
                *out_elem = PAD_VALUE;
            }
#else  // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
            // All pads are zero: no bounds check is performed
            *out_elem = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

            ++out_elem;
        }
    }

#ifdef HAS_BIAS
    // Only the work-item handling the last channel appends the bias column
    if(ch == (KERNEL_DEPTH - 1))
    {
        *out_elem = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

948

949

/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when

950

* the kernel width and height are the same as the width and height of the input tensor

951

*

952

* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float

953

* @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

954

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

955

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Gian Marco