Blame - src/core/CL/cl_kernels/im2col.cl - ml/ComputeLibrary

2018-07-18 16:13:12 +0100

[diff] [blame]

107

#if defined(NUM_GROUPS)

108

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;

109

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

110

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

111

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

112

113

VEC_DATA_TYPE(DATA_TYPE, 4)

114

data = vload4(0, (__global DATA_TYPE *)input_ptr);

115

116

// If out-of-bound, overwrite with the first element

117

data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);

118

119

*(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;

120

*(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;

121

*(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;

122

*(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;

123

124

#ifdef HAS_BIAS

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

125

#if defined(NUM_GROUPS)

126

if(xo == (SRC_DEPTH / NUM_GROUPS - 1))

127

#else // defined(NUM_GROUPS)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

128

if(ch == (SRC_DEPTH - 1))

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

129

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

130

{

131

*((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;

132

*((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;

133

*((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;

134

*((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;

135

}

136

#endif // HAS_BIAS

137

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

138

#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

139

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

140

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

141

#if defined(DILATION_X) && defined(DILATION_Y)

142

/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

143

*

144

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

145

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

146

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

147

* @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

148

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

149

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

150

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

Georgios Pinitas

19ea419

2018-06-19 13:09:53 +0100

[diff] [blame]

151

* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

152

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

153

* @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

154

*

Sheri Zhang

2020-04-17 14:59:13 +0100

[diff] [blame]

155

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

156

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

157

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

158

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

159

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

160

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

161

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

162

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

163

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

164

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

165

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

166

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

167

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

168

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

169

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

170

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

171

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

172

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

173

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

174

__kernel void im2col_generic_nchw(

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

175

TENSOR3D_DECLARATION(src),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

176

#if defined(NUM_GROUPS)

177

TENSOR3D_DECLARATION(dst),

178

#else // defined(NUM_GROUPS)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

179

IMAGE_DECLARATION(dst),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

180

#endif // defined(NUM_GROUPS)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

181

uint src_stride_w,

182

uint dst_stride_w)

183

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

184

const int xc = get_global_id(0); // x coordinate in the convolved tensor

185

const int yc = get_global_id(1); // y coordinate in the convolved tensor

186

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

187

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

188

189

// Calculate input indices

190

const int xi = xc * STRIDE_X - PAD_LEFT;

191

const int yi = yc * STRIDE_Y - PAD_TOP;

192

193

// Calculate output indices

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

194

#if defined(NUM_GROUPS)

195

const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;

196

const int zo = ch / (SRC_DEPTH / NUM_GROUPS);

197

#else // defined(NUM_GROUPS)

198

const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;

199

#endif // defined(NUM_GROUPS)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

200

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

201

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

202

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;

203

#if defined(NUM_GROUPS)

204

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;

205

#else // defined(NUM_GROUPS)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

206

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

207

#endif // defined(NUM_GROUPS)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

208

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

209

// Linearize convolution elements

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

210

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

211

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

212

int y = yi + yk * DILATION_Y;

213

for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

214

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

215

int x = xi + xk * DILATION_X;

216

#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

217

*output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

218

#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

219

if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

220

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

221

*output_ptr = PAD_VALUE;

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

222

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

223

else

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

224

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

225

*output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

226

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

227

#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

228

}

229

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

230

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

231

#ifdef HAS_BIAS

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

232

#if defined(NUM_GROUPS)

233

if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))

234

#else // defined(NUM_GROUPS)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

235

if(ch == (SRC_DEPTH - 1))

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

236

#endif // defined(NUM_GROUPS)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

237

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

238

*output_ptr = 1.0f;

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

239

}

240

#endif // HAS_BIAS

241

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

242

#endif // defined(DILATION_X) && defined(DILATION_Y)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

243

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

244

/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

245

*

246

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

247

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

248

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

249

* @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

250

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

251

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

252

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

253

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

254

*

Sheri Zhang

2020-04-17 14:59:13 +0100

[diff] [blame]

255

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

256

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

257

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

258

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

259

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

260

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

261

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

262

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

263

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

264

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

265

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

266

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

267

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

268

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

269

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

270

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

271

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

272

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

273

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

274

__kernel void im2col3x3_nchw(

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

275

TENSOR3D_DECLARATION(src),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

276

#if defined(NUM_GROUPS)

277

TENSOR3D_DECLARATION(dst),

278

#else // defined(NUM_GROUPS)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

279

IMAGE_DECLARATION(dst),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

280

#endif // defined(NUM_GROUPS)

Pablo Tello

2018-04-04 10:01:14 +0100

[diff] [blame]

281

uint src_stride_w,

282

uint dst_stride_w)

283

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

284

const int xc = get_global_id(0); // x coordinate in the convolved tensor

285

const int yc = get_global_id(1); // y coordinate in the convolved tensor

286

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

287

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

288

289

// Calculate input indices

290

const int xi = xc * STRIDE_X - PAD_LEFT;

291

const int yi = yc * STRIDE_Y - PAD_TOP;

292

293

// Calculate output indices

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

294

#if defined(NUM_GROUPS)

295

const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3

296

const int zo = ch / (SRC_DEPTH / NUM_GROUPS);

297

#else // defined(NUM_GROUPS)

298

const int xo = ch * 9; // 3x3

299

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

300

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

301

302

// Get input and output address

303

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

304

#if defined(NUM_GROUPS)

305

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;

306

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

307

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

308

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

309

310

VEC_DATA_TYPE(DATA_TYPE, 3)

311

row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

312

VEC_DATA_TYPE(DATA_TYPE, 3)

313

row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

314

VEC_DATA_TYPE(DATA_TYPE, 3)

315

row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

316

317

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

318

// Put 0 if the value is out-of-bound

319

int3 x = (int3)xi + (int3)(0, 1, 2);

320

int3 y = (int3)yi + (int3)(0, 1, 2);

321

322

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

323

cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

324

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

325

cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

326

VEC_DATA_TYPE(COND_DATA_TYPE, 3)

327

cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

328

329

row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);

330

row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);

331

row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);

332

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

333

334

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);

335

*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;

336

337

#ifdef HAS_BIAS

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

338

#if defined(NUM_GROUPS)

339

if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))

340

#else // defined(NUM_GROUPS)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

341

if(ch == (SRC_DEPTH - 1))

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

342

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

343

{

344

*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;

}

#endif // HAS_BIAS

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

349

/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

350

*

351

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

352

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

353

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

354

* @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

355

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

356

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

357

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

358

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

359

* @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

360

*

Sheri Zhang

2020-04-17 14:59:13 +0100

[diff] [blame]

361

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

362

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

363

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

364

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

365

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

366

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

367

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

368

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

369

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

370

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

371

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

372

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

373

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

374

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

375

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

376

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

377

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

378

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

379

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

380

__kernel void im2col5x5_nchw(

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

381

TENSOR3D_DECLARATION(src),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

382

#if defined(NUM_GROUPS)

383

TENSOR3D_DECLARATION(dst),

384

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

385

IMAGE_DECLARATION(dst),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

386

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

387

uint src_stride_w,

388

uint dst_stride_w)

389

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

390

const int xc = get_global_id(0); // x coordinate in the convolved tensor

391

const int yc = get_global_id(1); // y coordinate in the convolved tensor

392

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

393

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

394

395

// Calculate input indices

396

const int xi = xc * STRIDE_X - PAD_LEFT;

397

const int yi = yc * STRIDE_Y - PAD_TOP;

398

399

// Calculate output indices

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

400

#if defined(NUM_GROUPS)

401

const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5

402

const int zo = ch / (SRC_DEPTH / NUM_GROUPS);

403

#else // defined(NUM_GROUPS)

404

const int xo = ch * 25; // 5x5

405

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

406

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

407

408

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

409

// Put 0 if the value is out-of-bound

410

int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);

411

int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);

int x1 = xi + 4;

int y1 = yi + 4;

// Check if we could have out-of-bounds elements in the x direction

416

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

417

x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

418

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

419

y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

420

COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);

421

COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);

422

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

423

424

// Get input and output address

425

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

426

#if defined(NUM_GROUPS)

427

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;

428

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

429

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

430

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

431

432

{

433

VEC_DATA_TYPE(DATA_TYPE, 4)

434

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

435

DATA_TYPE

436

row01 = *((__global DATA_TYPE *)input_ptr + 4);

437

438

input_ptr += src_stride_y;

439

440

VEC_DATA_TYPE(DATA_TYPE, 4)

441

row10 = vload4(0, (__global DATA_TYPE *)input_ptr);

442

DATA_TYPE

443

row11 = *((__global DATA_TYPE *)input_ptr + 4);

444

445

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

446

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

447

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;

448

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

449

cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;

450

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);

451

COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);

452

453

// Replace with 0 if the value is not valid

454

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

455

row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);

456

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

457

row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);

458

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

459

460

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,

461

row10.s012),

462

0, (__global DATA_TYPE *)output_ptr);

463

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

464

465

input_ptr += src_stride_y;

466

output_ptr += 10 * dst_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 4)

471

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

472

DATA_TYPE

473

row01 = *((__global DATA_TYPE *)input_ptr + 4);

474

475

input_ptr += src_stride_y;

476

477

VEC_DATA_TYPE(DATA_TYPE, 4)

478

row10 = vload4(0, (__global DATA_TYPE *)input_ptr);

479

DATA_TYPE

480

row11 = *((__global DATA_TYPE *)input_ptr + 4);

481

482

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

483

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

484

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;

485

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

486

cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;

487

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);

488

COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);

489

490

// Replace with 0 if the value is not valid

491

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

492

row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);

493

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

494

row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);

495

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

496

497

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,

498

row10.s012),

499

0, (__global DATA_TYPE *)output_ptr);

500

vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

501

502

input_ptr += src_stride_y;

503

output_ptr += 10 * dst_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 4)

508

row00 = vload4(0, (__global DATA_TYPE *)input_ptr);

509

DATA_TYPE

510

row01 = *((__global DATA_TYPE *)input_ptr + 4);

511

512

input_ptr += src_stride_y;

513

514

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

515

VEC_DATA_TYPE(COND_DATA_TYPE, 4)

516

cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;

517

COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);

518

519

// Replace with 0 if the value is not valid

520

row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);

521

row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);

522

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

523

524

vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);

525

*((__global DATA_TYPE *)output_ptr + 4) = row01;

526

527

output_ptr += 5 * dst_stride_x;

528

}

529

530

#ifdef HAS_BIAS

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

531

#if defined(NUM_GROUPS)

532

if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))

533

#else // defined(NUM_GROUPS)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

534

if(ch == (SRC_DEPTH - 1))

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

535

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

536

{

537

*((__global DATA_TYPE *)output_ptr) = 1.0f;

538

}

539

#endif // HAS_BIAS

540

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

541

#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

542

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

543

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)

544

/** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

545

*

546

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

547

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

548

* @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

549

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

550

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

551

* @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

552

*

Sheri Zhang

2020-04-17 14:59:13 +0100

[diff] [blame]

553

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

554

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

555

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

556

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

557

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

558

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

559

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

560

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

561

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

562

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

563

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

564

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

565

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

566

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

567

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

568

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

569

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

570

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

571

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

572

__kernel void im2col11x11_padx0_pady0_nchw(

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

573

TENSOR3D_DECLARATION(src),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

574

#if defined(NUM_GROUPS)

575

TENSOR3D_DECLARATION(dst),

576

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

577

IMAGE_DECLARATION(dst),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

578

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

579

uint src_stride_w,

580

uint dst_stride_w)

581

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

582

const int xc = get_global_id(0); // x coordinate in the convolved tensor

583

const int yc = get_global_id(1); // y coordinate in the convolved tensor

584

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

585

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

586

587

// Calculate input indices

588

const int xi = xc * STRIDE_X;

589

const int yi = yc * STRIDE_Y;

590

591

// Calculate output indices

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

592

#if defined(NUM_GROUPS)

593

const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11

594

const int zo = ch / (SRC_DEPTH / NUM_GROUPS);

595

#else // defined(NUM_GROUPS)

596

const int xo = ch * 121; // 11x11

597

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

598

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

599

600

// Get input and output address

601

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

602

#if defined(NUM_GROUPS)

603

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;

604

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

605

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

606

#endif // defined(NUM_GROUPS)

607

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

608

{

609

VEC_DATA_TYPE(DATA_TYPE, 8)

610

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

611

VEC_DATA_TYPE(DATA_TYPE, 3)

612

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

613

614

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

615

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

616

617

input_ptr += src_stride_y;

618

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

623

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

624

VEC_DATA_TYPE(DATA_TYPE, 3)

625

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

626

627

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

628

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

629

630

input_ptr += src_stride_y;

631

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

636

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

637

VEC_DATA_TYPE(DATA_TYPE, 3)

638

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

639

640

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

641

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

642

643

input_ptr += src_stride_y;

644

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

649

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

650

VEC_DATA_TYPE(DATA_TYPE, 3)

651

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

652

653

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

654

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

655

656

input_ptr += src_stride_y;

657

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

662

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

663

VEC_DATA_TYPE(DATA_TYPE, 3)

664

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

665

666

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

667

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

668

669

input_ptr += src_stride_y;

670

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

675

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

676

VEC_DATA_TYPE(DATA_TYPE, 3)

677

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

678

679

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

680

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

681

682

input_ptr += src_stride_y;

683

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

688

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

689

VEC_DATA_TYPE(DATA_TYPE, 3)

690

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

691

692

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

693

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

694

695

input_ptr += src_stride_y;

696

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

701

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

702

VEC_DATA_TYPE(DATA_TYPE, 3)

703

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

704

705

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

706

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

707

708

input_ptr += src_stride_y;

709

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

714

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

715

VEC_DATA_TYPE(DATA_TYPE, 3)

716

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

717

718

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

719

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

720

721

input_ptr += src_stride_y;

722

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

727

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

728

VEC_DATA_TYPE(DATA_TYPE, 3)

729

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

730

731

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

732

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

733

734

input_ptr += src_stride_y;

735

output_ptr += 11 * src_stride_x;

}

{

VEC_DATA_TYPE(DATA_TYPE, 8)

740

row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));

741

VEC_DATA_TYPE(DATA_TYPE, 3)

742

row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

743

744

vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);

745

vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);

746

747

output_ptr += 11 * src_stride_x;

748

}

749

750

#ifdef HAS_BIAS

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

751

#if defined(NUM_GROUPS)

752

if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))

753

#else // defined(NUM_GROUPS)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

754

if(ch == (SRC_DEPTH - 1))

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

755

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

756

{

757

*((__global DATA_TYPE *)output_ptr) = 1.0f;

758

}

759

#endif // HAS_BIAS

760

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

761

#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

762

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

763

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)

764

/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

765

*

766

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.

767

* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.

768

* @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

769

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

770

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

771

* @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

772

*

Sheri Zhang

2020-04-17 14:59:13 +0100

[diff] [blame]

773

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

774

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

775

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

776

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

777

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

778

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

779

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

780

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

781

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

782

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

783

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

784

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

785

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

786

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

787

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

788

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

789

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

790

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

791

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

792

__kernel void im2col_generic_padx0_pady0_nchw(

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

793

TENSOR3D_DECLARATION(src),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

794

#if defined(NUM_GROUPS)

795

TENSOR3D_DECLARATION(dst),

796

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

797

IMAGE_DECLARATION(dst),

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

798

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

799

uint src_stride_w,

800

uint dst_stride_w)

801

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

802

const int xc = get_global_id(0); // x coordinate in the convolved tensor

803

const int yc = get_global_id(1); // y coordinate in the convolved tensor

804

const int ch = get_global_id(2) % SRC_DEPTH; // input feature map

805

const int batch = get_global_id(2) / SRC_DEPTH; // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

806

807

// Calculate input indices

808

const int xi = xc * STRIDE_X;

809

const int yi = yc * STRIDE_Y;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

810

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

811

// Calculate output indices

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

812

#if defined(NUM_GROUPS)

813

const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;

814

const int zo = ch / (SRC_DEPTH / NUM_GROUPS);

815

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

816

const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

817

#endif // defined(NUM_GROUPS)

818

const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

819

820

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;

821

#if defined(NUM_GROUPS)

822

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;

823

#else // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

824

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

825

#endif // defined(NUM_GROUPS)

826

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

827

// Linearize convolution elements

828

for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)

829

{

830

int last_x = 0;

831

for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)

832

{

833

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

834

row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

835

VSTORE(VECTOR_SIZE)

836

(row, 0, output_ptr);

837

last_x = x;

838

}

839

// Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).

840

// Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit.

841

#if WIDTH_MOD_VECTOR_SIZE == 1

842

*output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));

843

#elif WIDTH_MOD_VECTOR_SIZE > 1

844

VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)

845

row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));

846

VSTORE(WIDTH_MOD_VECTOR_SIZE)

847

(row, 0, output_ptr);

848

#endif /* WIDTH_MOD_VECTOR_SIZE */

849

output_ptr += WIDTH_MOD_VECTOR_SIZE;

850

} /* End of loop over KERNEL_HEIGHT */

851

852

#ifdef HAS_BIAS

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

853

#if defined(NUM_GROUPS)

854

if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))

855

#else // defined(NUM_GROUPS)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

856

if(ch == (SRC_DEPTH - 1))

Giorgio Arena

2018-07-18 16:13:12 +0100

[diff] [blame]

857

#endif // defined(NUM_GROUPS)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

858

{

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

859

*output_ptr = 1.0f;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

860

}

861

#endif // HAS_BIAS

862

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

863

#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

864

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

865

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

866

867

#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

868

#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

869

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

870

/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension

871

* @name IM2COL1X9_NHWC_STORE

872

*

873

* @note To use this macro for a 3x3 block, @p ROW has to be 0

874

*

875

* @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16

876

* @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size

877

* @param[in] DATA_TYPE Data type of @p DATA

878

* @param[in] SRC_DEPTH Input channel size / depth

879

* @param[in] DATA Value variable base name

880

* @param[in] ROW The row number to store. Supported: 0-8

881

* @param[in] OUTPUT_PTR Output pointer

882

* @{

883

*/

884

#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE

885

#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \

886

const bool at_channel_boundary = get_global_id(0) == 0; \

887

if(at_channel_boundary) \

888

{ \

889

IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \

} \

else \

{ \

IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \

894

}

895

#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE

896

#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \

897

IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)

898

#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE

899

900

#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \

901

VSTORE(VECTOR_SIZE) \

902

(DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \

903

VSTORE(VECTOR_SIZE) \

904

(DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \

905

VSTORE(VECTOR_SIZE) \

906

(DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \

907

VSTORE(VECTOR_SIZE) \

908

(DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \

909

VSTORE(VECTOR_SIZE) \

910

(DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \

911

VSTORE(VECTOR_SIZE) \

912

(DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \

913

VSTORE(VECTOR_SIZE) \

914

(DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \

915

VSTORE(VECTOR_SIZE) \

916

(DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \

917

VSTORE(VECTOR_SIZE) \

918

(DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);

919

920

#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \

921

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

922

(DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \

923

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

924

(DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \

925

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

926

(DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \

927

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

928

(DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \

929

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

930

(DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \

931

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

932

(DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \

933

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

934

(DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \

935

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

936

(DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \

937

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \

938

(DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);

939

/** @}*/

940

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

941

/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC

942

*

943

* @note This kernel computes VECTOR_SIZE elements

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

944

* @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements

SiCong Li

315a20e

2020-08-11 14:10:30 +0100

[diff] [blame]

945

* @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2

946

* @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

947

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

948

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

949

* @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

950

* @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1

951

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

952

*

Sheri Zhang

2020-04-17 14:59:13 +0100

[diff] [blame]

953

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

954

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

955

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

956

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

957

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

958

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

959

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

960

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

961

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

962

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

963

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

964

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

965

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

966

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

967

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

968

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

969

*/

970

__kernel void im2col3x3_nhwc(

971

TENSOR3D_DECLARATION(src),

972

IMAGE_DECLARATION(dst),

973

uint src_stride_w,

974

uint dst_stride_w)

975

{

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

976

// input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding

977

const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;

978

const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);

979

const int yo = get_global_id(1);

980

const int batch = get_global_id(2); // batch size

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

981

982

// Calculate input indices

983

const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;

984

const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;

985

986

// Get input and output address

987

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;

988

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;

int yi_coord = 0;

int3 offset = 0;

// Clamp xi

int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

995

#if PAD_LEFT != 0 || PAD_RIGHT != 0

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

996

#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

997

xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

998

#endif // PAD_LEFT != 0 || PAD_RIGHT != 0

999

// Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1000

xi_offset *= (int3)src_stride_y;

1001

1002

// Out-of-bound condition for X

1003

int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);

// yi == 0

// Clamp yi

// yi_coord is casted to unsigned int in order to use just a min() operation

1008

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1009

// This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),

1010

// because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around

1011

// also causes y_cond (y padding condition) to be satisfied

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1012

yi_coord = yi - (int)PAD_TOP;

1013

1014

// Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0

1015

#if PAD_TOP != 0 || PAD_BOTTOM != 0

1016

yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));

1017

#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

1018

1019

// Compute offset

1020

offset = xi_offset + (yi_coord * (int)src_stride_z);

1021

1022

// Load input values

1023

VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));

1024

VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));

1025

VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

1026

1027

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

1028

// Replace invalid values with PAD_VALUE

1029

int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

1030

values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));

1031

values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));

1032

values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1033

#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

1034

1035

// yi == 1

1036

// Clamp yi_coord (it can be negative if PAD_TOP > 1)

1037

yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;

1038

1039

// Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0

1040

#if PAD_TOP != 0 || PAD_BOTTOM != 0

1041

yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));

1042

#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

1043

1044

// Compute offset

1045

offset = xi_offset + (yi_coord * (int)src_stride_z);

1046

1047

// Load input values

1048

VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));

1049

VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));

1050

VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

1051

1052

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

1053

// Replace invalid values with zeros

Gian Marco Iodice

2318fcf

2018-08-09 15:50:07 +0100

[diff] [blame]

1054

y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

1055

values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));

1056

values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));

1057

values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1058

#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

// yi == 2

// Clamp yi_coord

yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;

1063

1064

// Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0

1065

#if PAD_TOP != 0 || PAD_BOTTOM != 0

1066

yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));

1067

#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

1068

1069

// Compute offset

1070

offset = xi_offset + (yi_coord * (int)src_stride_z);

1071

1072

// Load input values

1073

VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));

1074

VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));

1075

VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

1076

1077

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

1078

// Replace invalid values with PAD_VALUE

Gian Marco Iodice

2318fcf

2018-08-09 15:50:07 +0100

[diff] [blame]

1079

y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

1080

values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));

1081

values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));

1082

values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1083

#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

1084

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1085

// Store in a boundary-aware way to avoid padding

1086

IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1087

1088

#ifdef HAS_BIAS

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1089

// We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is

1090

// added at the end of the channel, while the boundary vec is at the beginning of the channel.

1091

// The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in

1092

// the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE

1093

// See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1094

if((ch + VECTOR_SIZE) >= SRC_DEPTH)

1095

{

1096

*((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;

}

#endif // HAS_BIAS

}

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1101

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

1102

#define IM2COL1x9(i) \

1103

({ \

1104

yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \

1105

yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1106

\

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

1107

offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \

1108

offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1109

\

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

1110

VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \

1111

VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \

1112

VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \

1113

VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \

1114

VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \

1115

VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \

1116

VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \

1117

VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \

1118

VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1119

\

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

1120

int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \

1121

values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \

1122

values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \

1123

values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \

1124

values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \

1125

values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \

1126

values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \

1127

values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \

1128

values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \

1129

values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1130

\

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1131

IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1132

})

1133

#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1134

#define IM2COL1x9(i) \

1135

({ \

1136

yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \

1137

yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1138

\

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1139

offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \

1140

offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1141

\

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1142

VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \

1143

VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \

1144

VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \

1145

VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \

1146

VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \

1147

VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \

1148

VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \

1149

VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \

1150

VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1151

\

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1152

IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1153

})

1154

#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

1155

1156

/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC

1157

*

1158

* @note This kernel computes VECTOR_SIZE elements

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1159

* @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements

SiCong Li

315a20e

2020-08-11 14:10:30 +0100

[diff] [blame]

1160

* @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2

1161

* @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1162

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

1163

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

1164

* @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3

1165

* @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1

1166

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

1167

*

Sheri Zhang

2020-04-17 14:59:13 +0100

[diff] [blame]

1168

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1169

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1170

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1171

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1172

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1173

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1174

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1175

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1176

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1177

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1178

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1179

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1180

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1181

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1182

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

1183

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

1184

*/

1185

__kernel void im2col9x9_nhwc(

1186

TENSOR3D_DECLARATION(src),

1187

IMAGE_DECLARATION(dst),

1188

uint src_stride_w,

1189

uint dst_stride_w)

1190

{

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1191

// input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding

1192

const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;

1193

const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);

1194

const int yo = get_global_id(1);

1195

const int batch = get_global_id(2); // batch size

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1196

1197

// Calculate input indices

1198

const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;

1199

const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;

1200

1201

// Get input and output address

1202

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;

1203

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;

int yi_coord = 0;

int8 offset0 = 0;

int offset1 = 0;

// Clamp xi

int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);

1211

int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);

1212

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1213

#if PAD_LEFT != 0 || PAD_RIGHT != 0

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1214

#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

1215

xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));

1216

xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1217

#endif // PAD_LEFT != 0 || PAD_RIGHT != 0

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1218

xi_offset0 *= (int8)src_stride_y;

1219

xi_offset1 *= (int)src_stride_y;

1220

1221

// Out-of-bound condition for X

1222

int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);

1223

int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);

IM2COL1x9(0);

IM2COL1x9(1);

IM2COL1x9(2);

IM2COL1x9(3);

IM2COL1x9(4);

IM2COL1x9(5);

IM2COL1x9(6);

IM2COL1x9(7);

IM2COL1x9(8);

#ifdef HAS_BIAS

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1236

// We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is

1237

// added at the end of the channel, while the boundary vec is at the beginning of the channel.

1238

// The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in

1239

// the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE

1240

// See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1241

if((ch + VECTOR_SIZE) >= SRC_DEPTH)

1242

{

1243

*((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;

}

#endif // HAS_BIAS

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1248

/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1249

*

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1250

* @note This kernel computes VECTOR_SIZE elements

1251

* @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements

SiCong Li

315a20e

2020-08-11 14:10:30 +0100

[diff] [blame]

1252

* @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2

1253

* @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1254

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

1255

* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128

1256

* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1257

* @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1258

* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2

1259

* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0

1260

* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1

Alex Gilday

7da29b6

2018-03-23 14:16:00 +0000

[diff] [blame]

1261

* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1262

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

1263

*

Sheri Zhang

2020-04-17 14:59:13 +0100

[diff] [blame]

1264

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1265

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1266

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1267

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1268

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1269

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1270

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1271

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1272

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1273

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1274

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1275

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1276

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1277

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1278

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).

1279

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).

1280

*/

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1281

__kernel void im2col_generic_nhwc(

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1282

TENSOR3D_DECLARATION(src),

1283

IMAGE_DECLARATION(dst),

1284

uint src_stride_w,

1285

uint dst_stride_w)

1286

{

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1287

// input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding

1288

const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;

1289

const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);

1290

const int yo = get_global_id(1);

1291

const int batch = get_global_id(2); // batch size

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1292

1293

// Calculate input indices

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1294

const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;

1295

const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1296

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1297

// Get input and output address

1298

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;

1299

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1300

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1301

int i = 0;

Alex Gilday

7da29b6

2018-03-23 14:16:00 +0000

[diff] [blame]

1302

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1303

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1304

// Clamp yi_coord

1305

int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;

1306

yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));

1307

1308

// Out-of-bound condition for Y

1309

int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);

1310

1311

for(int xk = 0; xk < KERNEL_WIDTH; ++xk)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1312

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1313

// Clamp xi_coord

1314

int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);

1315

xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));

1316

1317

// Out-of-bound condition for X

1318

int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);

1319

1320

int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);

1321

1322

VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));

1323

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1324

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1325

// Replace with PAD_VALUE if the value is out-of-bound

SiCong Li

2020-08-10 19:57:46 +0100

[diff] [blame]

1326

values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1327

#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1328

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1329

// Store in a boundary-aware way to avoid padding

1330

#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE

1331

const bool at_channel_boundary = get_global_id(0) == 0;

1332

if(at_channel_boundary)

1333

{

1334

VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)

1335

(values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);

1336

}

1337

else // at_channel_boundary

1338

#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE

1339

{

1340

VSTORE(VECTOR_SIZE)

1341

(values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);

1342

}

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1343

i++;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

}

}

#ifdef HAS_BIAS

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1348

// We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is

1349

// added at the end of the channel, while the boundary vec is at the beginning of the channel.

1350

// The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in

1351

// the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE

1352

// See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1353

if((ch + VECTOR_SIZE) >= SRC_DEPTH)

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1354

{

Gian Marco Iodice

2018-06-28 16:29:29 +0100

[diff] [blame]

1355

*((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;

Gian Marco

2018-01-29 12:15:32 +0000

[diff] [blame]

1356

}

1357

#endif // HAS_BIAS

1358

}

SiCong Li

2020-08-05 15:04:00 +0100

[diff] [blame]

1359

#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)

Pablo Tello