Blame - src/core/CL/cl_kernels/im2col.cl - ml/ComputeLibrary

2018-06-28 16:29:29 +0100

[diff] [blame^]

109

if(ch == (SRC_DEPTH - 1))

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

110

{

111

*((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;

112

*((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;

113

*((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;

114

*((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;

115

}

116

#endif // HAS_BIAS

117

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

118

#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

119

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

120

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

121

#if defined(DILATION_X) && defined(DILATION_Y)

122

/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

123

*

124

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

125

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

126

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

127

* @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

128

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

129

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

130

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

Georgios Pinitas

19ea419

2018-06-19 13:09:53 +0100

[diff] [blame]

131

* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

132

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

133

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

134

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

135

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

136

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

137

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

138

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

139

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

140

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

141

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

142

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

143

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

144

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

145

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

146

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

147

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

148

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

149

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

150

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

151

__kernel void im2col_generic_nchw(

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

152

TENSOR3D_DECLARATION(src),

153

IMAGE_DECLARATION(dst),

154

uint src_stride_w,

155

uint dst_stride_w)

156

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

157

const int xc = get_global_id(0); // x coordinate in the convolved tensor

158

const int yc = get_global_id(1); // y coordinate in the convolved tensor

159

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

160

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

161

162

// Calculate input indices

163

const int xi = xc * STRIDE_X - PAD_LEFT;

164

const int yi = yc * STRIDE_Y - PAD_TOP;

165

166

// Calculate output indices

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

167

const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

168

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

169

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

170

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;

171

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

172

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

173

// Linearize convolution elements

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

174

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

175

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

176

int y = yi + yk * DILATION_Y;

177

for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

178

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

179

int x = xi + xk * DILATION_X;

180

#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

181

*output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

182

#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

183

if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

184

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

185

*output_ptr = PAD_VALUE;

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

186

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

187

else

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

188

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

189

*output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

190

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

191

#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

192

}

193

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

194

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

195

#ifdef HAS_BIAS

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

196

if(ch == (SRC_DEPTH - 1))

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

197

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

198

*output_ptr = 1.0f;

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

199

}

200

#endif // HAS_BIAS

201

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

202

#endif // defined(DILATION_X) && defined(DILATION_Y)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

203

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

204

/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

205

*

206

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

207

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

208

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

209

* @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

210

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

211

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

212

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

213

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

214

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

215

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

216

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

217

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

218

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

219

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

220

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

221

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

222

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

223

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

224

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

225

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

226

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

227

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

228

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

229

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

230

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

231

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

232

__kernel void im2col3x3_nchw(

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

233

TENSOR3D_DECLARATION(src),

234

IMAGE_DECLARATION(dst),

235

uint src_stride_w,

236

uint dst_stride_w)

237

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

238

const int xc = get_global_id(0); // x coordinate in the convolved tensor

239

const int yc = get_global_id(1); // y coordinate in the convolved tensor

240

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

241

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

242

243

// Calculate input indices

244

const int xi = xc * STRIDE_X - PAD_LEFT;

245

const int yi = yc * STRIDE_Y - PAD_TOP;

246

247

// Calculate output indices

248

const int xo = ch * 9; // 3x3

249

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

250

251

// Get input and output address

252

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;

253

254

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

255

256

VEC_DATA_TYPE(DATA_TYPE, 3)

257

row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

258

VEC_DATA_TYPE(DATA_TYPE, 3)

259

row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

260

VEC_DATA_TYPE(DATA_TYPE, 3)

261

row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

262

263

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

264

// Put 0 if the value is out-of-bound

265

int3 x = (int3)xi + (int3)(0, 1, 2);

266

int3 y = (int3)yi + (int3)(0, 1, 2);

267

268

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

269

cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

270

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

271

cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

272

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

273

cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

274

275

row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);

276

row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);

277

row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);

278

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

279

280

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);

281

*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;

282

283

#ifdef HAS_BIAS

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

284

if(ch == (SRC_DEPTH - 1))

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

285

{

286

*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;

}

#endif // HAS_BIAS

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

291

/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

292

*

293

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

294

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

295

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

296

* @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

297

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

298

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

299

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

300

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

301

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

302

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

303

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

304

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

305

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

306

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

307

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

308

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

309

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

310

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

311

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

312

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

313

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

314

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

315

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

316

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

317

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

318

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

319

__kernel void im2col5x5_nchw(

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

320

TENSOR3D_DECLARATION(src),

321

IMAGE_DECLARATION(dst),

322

uint src_stride_w,

323

uint dst_stride_w)

324

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

325

const int xc = get_global_id(0); // x coordinate in the convolved tensor

326

const int yc = get_global_id(1); // y coordinate in the convolved tensor

327

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

328

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

329

330

// Calculate input indices

331

const int xi = xc * STRIDE_X - PAD_LEFT;

332

const int yi = yc * STRIDE_Y - PAD_TOP;

333

334

// Calculate output indices

335

const int xo = ch * 25; // 5x5

336

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

337

338

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

339

// Put 0 if the value is out-of-bound

340

int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);

341

int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);

int x1 = xi + 4;

int y1 = yi + 4;

// Check if we could have out-of-bounds elements in the x direction

346

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

347

x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

348

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

349

y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

350

COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);

351

COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);

352

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

353

354

// Get input and output address

355

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;

356

357

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

358

359

{

360

VEC_DATA_TYPE(DATA_TYPE, 4)

361

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

362

DATA_TYPE

363

row01 = *((__global DATA_TYPE *)input_ptr + 4);

364

365

input_ptr += src_stride_y;

366

367

VEC_DATA_TYPE(DATA_TYPE, 4)

368

row10 = vload4(0, (__global DATA_TYPE *)input_ptr);

369

DATA_TYPE

370

row11 = *((__global DATA_TYPE *)input_ptr + 4);

371

372

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

373

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

374

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;

375

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

376

cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;

377

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);

378

COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);

379

380

// Replace with 0 if the value is not valid

381

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

382

row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);

383

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

384

row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);

385

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

386

387

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,

388

row10.s012),

389

0, (__global DATA_TYPE *)output_ptr);

390

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

391

392

input_ptr += src_stride_y;

393

output_ptr += 10 * dst_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 4)

398

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

399

DATA_TYPE

400

row01 = *((__global DATA_TYPE *)input_ptr + 4);

401

402

input_ptr += src_stride_y;

403

404

VEC_DATA_TYPE(DATA_TYPE, 4)

405

row10 = vload4(0, (__global DATA_TYPE *)input_ptr);

406

DATA_TYPE

407

row11 = *((__global DATA_TYPE *)input_ptr + 4);

408

409

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

410

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

411

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;

412

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

413

cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;

414

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);

415

COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);

416

417

// Replace with 0 if the value is not valid

418

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

419

row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);

420

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

421

row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);

422

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

423

424

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,

425

row10.s012),

426

0, (__global DATA_TYPE *)output_ptr);

427

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

428

429

input_ptr += src_stride_y;

430

output_ptr += 10 * dst_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 4)

435

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

436

DATA_TYPE

437

row01 = *((__global DATA_TYPE *)input_ptr + 4);

438

439

input_ptr += src_stride_y;

440

441

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

442

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

443

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;

444

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);

445

446

// Replace with 0 if the value is not valid

447

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

448

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

449

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

450

451

vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);

452

*((__global DATA_TYPE *)output_ptr + 4) = row01;

453

454

output_ptr += 5 * dst_stride_x;

455

}

456

457

#ifdef HAS_BIAS

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

458

if(ch == (SRC_DEPTH - 1))

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

459

{

460

*((__global DATA_TYPE *)output_ptr) = 1.0f;

461

}

462

#endif // HAS_BIAS

463

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

464

#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

465

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

466

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)

467

/** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

468

*

469

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

470

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

471

* @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

472

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

473

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

474

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

475

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

476

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

477

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

478

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

479

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

480

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

481

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

482

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

483

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

484

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

485

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

486

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

487

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

488

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

489

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

490

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

491

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

492

__kernel void im2col11x11_padx0_pady0_nchw(

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

493

TENSOR3D_DECLARATION(src),

494

IMAGE_DECLARATION(dst),

495

uint src_stride_w,

496

uint dst_stride_w)

497

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

498

const int xc = get_global_id(0); // x coordinate in the convolved tensor

499

const int yc = get_global_id(1); // y coordinate in the convolved tensor

500

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

501

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

502

503

// Calculate input indices

504

const int xi = xc * STRIDE_X;

505

const int yi = yc * STRIDE_Y;

506

507

// Calculate output indices

508

const int xo = ch * 121; // 11x11

509

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

510

511

// Get input and output address

512

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;

513

514

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

515

{

516

VEC_DATA_TYPE(DATA_TYPE, 8)

517

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

518

VEC_DATA_TYPE(DATA_TYPE, 3)

519

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

520

521

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

522

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

523

524

input_ptr += src_stride_y;

525

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

530

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

531

VEC_DATA_TYPE(DATA_TYPE, 3)

532

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

533

534

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

535

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

536

537

input_ptr += src_stride_y;

538

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

543

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

544

VEC_DATA_TYPE(DATA_TYPE, 3)

545

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

546

547

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

548

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

549

550

input_ptr += src_stride_y;

551

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

556

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

557

VEC_DATA_TYPE(DATA_TYPE, 3)

558

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

559

560

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

561

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

562

563

input_ptr += src_stride_y;

564

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

569

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

570

VEC_DATA_TYPE(DATA_TYPE, 3)

571

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

572

573

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

574

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

575

576

input_ptr += src_stride_y;

577

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

582

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

583

VEC_DATA_TYPE(DATA_TYPE, 3)

584

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

585

586

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

587

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

588

589

input_ptr += src_stride_y;

590

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

595

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

596

VEC_DATA_TYPE(DATA_TYPE, 3)

597

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

598

599

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

600

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

601

602

input_ptr += src_stride_y;

603

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

608

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

609

VEC_DATA_TYPE(DATA_TYPE, 3)

610

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

611

612

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

613

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

614

615

input_ptr += src_stride_y;

616

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

621

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

622

VEC_DATA_TYPE(DATA_TYPE, 3)

623

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

624

625

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

626

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

627

628

input_ptr += src_stride_y;

629

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

634

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

635

VEC_DATA_TYPE(DATA_TYPE, 3)

636

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

637

638

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

639

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

640

641

input_ptr += src_stride_y;

642

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

647

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

648

VEC_DATA_TYPE(DATA_TYPE, 3)

649

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

650

651

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

652

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

653

654

output_ptr += 11 * src_stride_x;

655

}

656

657

#ifdef HAS_BIAS

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

658

if(ch == (SRC_DEPTH - 1))

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

659

{

660

*((__global DATA_TYPE *)output_ptr) = 1.0f;

661

}

662

#endif // HAS_BIAS

663

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

664

#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

665

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

666

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)

667

/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

668

*

669

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.

670

* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.

671

* @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

672

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

673

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

674

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

675

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

676

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

677

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

678

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

679

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

680

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

681

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

682

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

683

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

684

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

685

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

686

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

687

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

688

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

689

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

690

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

691

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

692

__kernel void im2col_generic_padx0_pady0_nchw(

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

693

TENSOR3D_DECLARATION(src),

694

IMAGE_DECLARATION(dst),

695

uint src_stride_w,

696

uint dst_stride_w)

697

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

698

const int xc = get_global_id(0); // x coordinate in the convolved tensor

699

const int yc = get_global_id(1); // y coordinate in the convolved tensor

700

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

701

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

702

703

// Calculate input indices

704

const int xi = xc * STRIDE_X;

705

const int yi = yc * STRIDE_Y;

706

// Calculate output indices

707

const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;

708

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

709

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;

710

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;

711

// Linearize convolution elements

712

for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)

713

{

714

int last_x = 0;

715

for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)

716

{

717

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

718

row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

719

VSTORE(VECTOR_SIZE)

720

(row, 0, output_ptr);

721

last_x = x;

722

}

723

// Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).

724

// Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit.

725

#if WIDTH_MOD_VECTOR_SIZE == 1

726

*output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));

727

#elif WIDTH_MOD_VECTOR_SIZE > 1

728

VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)

729

row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));

730

VSTORE(WIDTH_MOD_VECTOR_SIZE)

731

(row, 0, output_ptr);

732

#endif /* WIDTH_MOD_VECTOR_SIZE */

733

output_ptr += WIDTH_MOD_VECTOR_SIZE;

734

} /* End of loop over KERNEL_HEIGHT */

735

736

#ifdef HAS_BIAS

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

737

if(ch == (SRC_DEPTH - 1))

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

738

{

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

739

*output_ptr = 1.0f;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

740

}

741

#endif // HAS_BIAS

742

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

743

#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

744

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

745

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)

746

747

#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

748

749

/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC

750

*

751

* @note This kernel computes VECTOR_SIZE elements

752

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

753

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

754

* @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

755

* @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1

756

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

757

*

758

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

759

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

760

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

761

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

762

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

763

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

764

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

765

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

766

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

767

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

768

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

769

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

770

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

771

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

772

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

773

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

774

*/

775

__kernel void im2col3x3_nhwc(

776

TENSOR3D_DECLARATION(src),

777

IMAGE_DECLARATION(dst),

uint src_stride_w,

uint dst_stride_w)

{

const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map

782

const int yo = get_global_id(1);

783

const int batch = get_global_id(2); // batch size

784

785

// Calculate input indices

786

const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;

787

const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;

788

789

// Get input and output address

790

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;

791

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;

int yi_coord = 0;

int3 offset = 0;

// Clamp xi

int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);

798

#if PAD_TOP != 0 || PAD_BOTTOM != 0

799

#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

800

xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));

801

#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

802

xi_offset *= (int3)src_stride_y;

803

804

// Out-of-bound condition for X

805

int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);

// yi == 0

// Clamp yi

// yi_coord is casted to unsigned int in order to use just a min() operation

810

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

811

yi_coord = yi - (int)PAD_TOP;

812

813

// Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0

814

#if PAD_TOP != 0 || PAD_BOTTOM != 0

815

yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));

816

#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

817

818

// Compute offset

819

offset = xi_offset + (yi_coord * (int)src_stride_z);

820

821

// Load input values

822

VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));

823

VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));

824

VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

825

826

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

827

// Replace invalid values with PAD_VALUE

828

int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));

829

values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));

830

values1 = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));

831

values2 = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));

832

#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

833

834

// yi == 1

835

// Clamp yi_coord (it can be negative if PAD_TOP > 1)

836

yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;

837

838

// Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0

839

#if PAD_TOP != 0 || PAD_BOTTOM != 0

840

yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));

841

#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

842

843

// Compute offset

844

offset = xi_offset + (yi_coord * (int)src_stride_z);

845

846

// Load input values

847

VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));

848

VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));

849

VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

850

851

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

852

// Replace invalid values with zeros

853

y_cond = (int)((uint)(yi - (int)PAD_TOP + 1) >= (uint)(SRC_HEIGHT));

854

values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));

855

values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));

856

values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));

857

#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

// yi == 2

// Clamp yi_coord

yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;

862

863

// Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0

864

#if PAD_TOP != 0 || PAD_BOTTOM != 0

865

yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));

866

#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

867

868

// Compute offset

869

offset = xi_offset + (yi_coord * (int)src_stride_z);

870

871

// Load input values

872

VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));

873

VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));

874

VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

875

876

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

877

// Replace invalid values with PAD_VALUE

878

y_cond = (int)((uint)(yi - (int)PAD_TOP + 2) >= (uint)(SRC_HEIGHT));

879

values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));

880

values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));

881

values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));

882

#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

// Store

VSTORE(VECTOR_SIZE)

(values0, 0, (__global DATA_TYPE *)(output_ptr) + 0 * SRC_DEPTH);

887

VSTORE(VECTOR_SIZE)

888

(values1, 0, (__global DATA_TYPE *)(output_ptr) + 1 * SRC_DEPTH);

889

VSTORE(VECTOR_SIZE)

890

(values2, 0, (__global DATA_TYPE *)(output_ptr) + 2 * SRC_DEPTH);

891

VSTORE(VECTOR_SIZE)

892

(values3, 0, (__global DATA_TYPE *)(output_ptr) + 3 * SRC_DEPTH);

893

VSTORE(VECTOR_SIZE)

894

(values4, 0, (__global DATA_TYPE *)(output_ptr) + 4 * SRC_DEPTH);

895

VSTORE(VECTOR_SIZE)

896

(values5, 0, (__global DATA_TYPE *)(output_ptr) + 5 * SRC_DEPTH);

897

VSTORE(VECTOR_SIZE)

898

(values6, 0, (__global DATA_TYPE *)(output_ptr) + 6 * SRC_DEPTH);

899

VSTORE(VECTOR_SIZE)

900

(values7, 0, (__global DATA_TYPE *)(output_ptr) + 7 * SRC_DEPTH);

901

VSTORE(VECTOR_SIZE)

902

(values8, 0, (__global DATA_TYPE *)(output_ptr) + 8 * SRC_DEPTH);

903

904

#ifdef HAS_BIAS

905

if((ch + VECTOR_SIZE) >= SRC_DEPTH)

906

{

907

*((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;

}

#endif // HAS_BIAS

}

/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

913

*

914

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

915

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

916

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

917

* @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

918

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

919

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

920

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

Alex Gilday

7da29b6

2018-03-23 14:16:00 +0000

[diff] [blame]

921

* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

922

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

923

*

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

924

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

925

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

926

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

927

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

928

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

929

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

930

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

931

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

932

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

933

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

934

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

935

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

936

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

937

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

938

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

939

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

940

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

941

__kernel void im2col_generic_nhwc(

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

942

TENSOR3D_DECLARATION(src),

943

IMAGE_DECLARATION(dst),

944

uint src_stride_w,

945

uint dst_stride_w)

946

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

947

const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map

948

const int yo = get_global_id(1);

949

const int batch = get_global_id(2); // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

950

951

// Calculate input indices

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

952

const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;

953

const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

954

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

955

// Get input and output address

956

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;

957

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

958

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

959

int i = 0;

Alex Gilday

7da29b6

2018-03-23 14:16:00 +0000

[diff] [blame]

960

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

961

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

962

// Clamp yi_coord

963

int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;

964

yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));

965

966

// Out-of-bound condition for Y

967

int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);

968

969

for(int xk = 0; xk < KERNEL_WIDTH; ++xk)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

970

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

971

// Clamp xi_coord

972

int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);

973

xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));

974

975

// Out-of-bound condition for X

976

int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);

977

978

int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);

979

980

VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));

981

982

// Replace with PAD_VALUE if the value is out-of-bound

983

values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))x_border_condition || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(y_border_condition));

// Store

VSTORE(VECTOR_SIZE)

(values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);

988

989

i++;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

}

}

#ifdef HAS_BIAS

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

994

if((ch + VECTOR_SIZE) >= SRC_DEPTH)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

995

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

996

*((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

997

}

998

#endif // HAS_BIAS

999

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame^]

1000

#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)

Pablo Tello