Blame - src/core/CL/cl_kernels/hog.cl - ml/ComputeLibrary

2017-09-04 18:44:23 +0100

[diff] [blame]

128

const float w1 = phase_value - floor(phase_value);

129

130

// The quantised phase is the histogram index [0, NUM_BINS - 1]

131

// Check limit of histogram index. If hidx == NUM_BINS, hidx = 0

132

const uint hidx = (uint)(phase_value) % NUM_BINS;

133

134

// Weighted vote between 2 bins

135

bins[hidx] += mag_value * (1.0f - w1);

136

bins[(hidx + 1) % NUM_BINS] += mag_value * w1;

137

}

138

139

// Point to the next row of magnitude and phase images

140

mag_row_ptr += mag_stride_y;

141

phase_row_ptr += phase_stride_y;

142

}

143

144

// Compute address for the destination image

145

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

146

147

// Store the local HOG in the global memory

148

int xc = 0;

149

for(; xc <= (NUM_BINS - 4); xc += 4)

150

{

151

float4 values = vload4(0, bins + xc);

152

153

vstore4(values, 0, ((__global float *)dst.ptr) + xc);

}

// Left over stores

for(; xc < NUM_BINS; ++xc)

158

{

159

((__global float *)dst.ptr)[xc] = bins[xc];

160

}

161

}

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

162

#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

163

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

164

#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

165

166

// The numeric values of the HOGNormType enumerators must be supplied as build options,
// because HOG_NORM_TYPE is compared against them by the preprocessor further down.
#if !defined(L2_NORM)
#error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel
#endif /* !defined(L2_NORM) */

#if !defined(L2HYS_NORM)
#error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel
#endif /* !defined(L2HYS_NORM) */

#if !defined(L1_NORM)
#error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel
#endif /* !defined(L1_NORM) */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

177

178

// Contribution of one histogram value (scalar or vector) to the block norm:
// val^2 for L2_NORM / L2HYS_NORM, |val| otherwise (L1_NORM).
#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
#define HOG_BLOCK_NORM_TERM(v) ((v) * (v))
#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
#define HOG_BLOCK_NORM_TERM(v) fabs(v)
#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */

/** This OpenCL kernel computes the HOG block normalization
 *
 * The kernel works in up to three passes over one block:
 *  -# Gather: the NUM_CELLS_PER_BLOCK_HEIGHT source rows (NUM_BINS_PER_BLOCK_X values each) are copied
 *     back-to-back into the destination while the norm term (val^2 or |val|) is accumulated.
 *     The first scale factor is 1 / (sqrt(sum) + 0.1 * NUM_BINS_PER_BLOCK) for every normalization type.
 *  -# Only for L2HYS_NORM: the gathered values are scaled, clipped to L2_HYST_THRESHOLD and written back,
 *     the sum of squares is recomputed and the scale factor becomes 1 / (sqrt(sum) + 1e-3).
 *  -# Scale: the NUM_BINS_PER_BLOCK values in the destination are multiplied by the scale factor in place.
 *
 * @attention The following variables must be passed at compile time:
 *
 * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block
 * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
 * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
 * -# -DHOG_NORM_TYPE = Normalization type
 * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
 * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
 * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
 * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
 *
 * @note Each work-item computes a single block
 *
 * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
                                      IMAGE_DECLARATION(dst))
{
    // Compute address for the source and destination tensor
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // All destination accesses of this work-item go through this float view of dst.ptr
    __global float *const block_out = (__global float *)dst.ptr;

    // Norm accumulators: one for the 4-wide path, one for the scalar left-over path
    float4 acc_vec = (float4)(0.0f);
    float  acc     = 0.0f;

    // Pass 1: gather the block rows into the destination and accumulate the norm term
    for(size_t row = 0; row < NUM_CELLS_PER_BLOCK_HEIGHT; ++row)
    {
        const __global float *row_in = (__global float *)(src.ptr + row * src_stride_y);

        // The rows are stored one after the other in the destination so that the following
        // passes can walk the whole block with consecutive accesses
        __global float *row_out = block_out + row * NUM_BINS_PER_BLOCK_X;

        int col = 0;
        while(col <= (NUM_BINS_PER_BLOCK_X - 16))
        {
            const float4 v0 = vload4(0, row_in + col + 0);
            const float4 v1 = vload4(0, row_in + col + 4);
            const float4 v2 = vload4(0, row_in + col + 8);
            const float4 v3 = vload4(0, row_in + col + 12);

            acc_vec += HOG_BLOCK_NORM_TERM(v0);
            acc_vec += HOG_BLOCK_NORM_TERM(v1);
            acc_vec += HOG_BLOCK_NORM_TERM(v2);
            acc_vec += HOG_BLOCK_NORM_TERM(v3);

            // Keep the un-normalized values: they are rescaled in place later on
            vstore4(v0, 0, row_out + col + 0);
            vstore4(v1, 0, row_out + col + 4);
            vstore4(v2, 0, row_out + col + 8);
            vstore4(v3, 0, row_out + col + 12);

            col += 16;
        }

        // Left-over values of the row (fewer than 16)
        while(col < NUM_BINS_PER_BLOCK_X)
        {
            const float v = row_in[col];

            acc += HOG_BLOCK_NORM_TERM(v);

            row_out[col] = v;

            ++col;
        }
    }

    // Fold the four vector lanes into the scalar accumulator
    acc += dot(acc_vec, (float4)1.0f);

    float scale = 1.0f / (sqrt(acc) + NUM_BINS_PER_BLOCK * 0.1f);

#if(HOG_NORM_TYPE == L2HYS_NORM)
    // Pass 2 (L2HYS_NORM only): scale, clip and re-accumulate the sum of squares
    acc_vec = (float4)0.0f;
    acc     = 0.0f;

    int k = 0;
    for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
    {
        __global float *chunk = block_out + k;

        float4 c0 = vload4(0, chunk + 0);
        float4 c1 = vload4(0, chunk + 4);
        float4 c2 = vload4(0, chunk + 8);
        float4 c3 = vload4(0, chunk + 12);

        // Scale, then clip anything above L2_HYST_THRESHOLD
        c0 = fmin(c0 * (float4)scale, (float4)L2_HYST_THRESHOLD);
        c1 = fmin(c1 * (float4)scale, (float4)L2_HYST_THRESHOLD);
        c2 = fmin(c2 * (float4)scale, (float4)L2_HYST_THRESHOLD);
        c3 = fmin(c3 * (float4)scale, (float4)L2_HYST_THRESHOLD);

        // Sum of squares of the clipped values
        acc_vec += c0 * c0;
        acc_vec += c1 * c1;
        acc_vec += c2 * c2;
        acc_vec += c3 * c3;

        vstore4(c0, 0, chunk + 0);
        vstore4(c1, 0, chunk + 4);
        vstore4(c2, 0, chunk + 8);
        vstore4(c3, 0, chunk + 12);
    }

    // Left-over values of the block (fewer than 16)
    for(; k < NUM_BINS_PER_BLOCK; ++k)
    {
        const float clipped = fmin(block_out[k] * scale, (float)L2_HYST_THRESHOLD);

        acc += clipped * clipped;

        block_out[k] = clipped;
    }

    acc += dot(acc_vec, (float4)1.0f);

    // We use the same constants of OpenCV
    scale = 1.0f / (sqrt(acc) + 1e-3f);

#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */

    // Final pass: multiply every value of the block by the normalization scale factor
    int i = 0;
    for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
    {
        __global float *chunk = block_out + i;

        float4 n0 = vload4(0, chunk + 0);
        float4 n1 = vload4(0, chunk + 4);
        float4 n2 = vload4(0, chunk + 8);
        float4 n3 = vload4(0, chunk + 12);

        n0 = n0 * (float4)scale;
        n1 = n1 * (float4)scale;
        n2 = n2 * (float4)scale;
        n3 = n3 * (float4)scale;

        vstore4(n0, 0, chunk + 0);
        vstore4(n1, 0, chunk + 4);
        vstore4(n2, 0, chunk + 8);
        vstore4(n3, 0, chunk + 12);
    }

    for(; i < NUM_BINS_PER_BLOCK; ++i)
    {
        block_out[i] *= scale;
    }
}

#undef HOG_BLOCK_NORM_TERM

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

352

#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

353

John Richardson

2018-01-09 11:17:00 +0000

[diff] [blame]

354

#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

355

356

/** This OpenCL kernel computes the HOG detector using linear SVM
 *
 * The score of a window is the dot product between the window's HOG values (NUM_BLOCKS_PER_DESCRIPTOR_Y rows of
 * NUM_BINS_PER_DESCRIPTOR_X values) and the matching entries of @p hog_descriptor, plus the bias stored right
 * after them. A window whose score is greater than THRESHOLD reserves a slot through an atomic increment of
 * @p num_detection_windows and is written to @p dst only if that slot is below MAX_NUM_DETECTION_WINDOWS.
 *
 * @attention The following variables must be passed at compile time:
 *
 * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
 * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
 * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
 * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
 * -# -DIDX_CLASS = Index of the class to detect
 * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction
 * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction
 * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
 * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
 *
 * @note Each work-item computes a single detection window
 *
 * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  hog_descriptor                    Pointer to HOG descriptor. Supported data types: F32
 * @param[out] dst                               Pointer to DetectionWindow array
 * @param[out] num_detection_windows             Number of objects detected
 */
__kernel void hog_detector(IMAGE_DECLARATION(src),
                           __global float *hog_descriptor,
                           __global DetectionWindow *dst,
                           __global uint *num_detection_windows)
{
    // Nothing to do if the DetectionWindow array is already full
    if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
    {
        return;
    }

    Image src = CONVERT_TO_IMAGE_STRUCT(src);

    // Distance between two consecutive source rows, expressed in float elements
    const int src_step_y_f32 = src_stride_y / sizeof(float);

    // Partial sums: vector lanes for the 8-wide path, scalar for the left-over path
    float4 score_f32 = (float4)0.0f;
    float  score     = 0.0f;

    __global float *const window_ptr = (__global float *)src.ptr;

    // Compute Linear SVM: accumulate feature * weight over every row of the descriptor
    for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb)
    {
        __global float *feat_row   = window_ptr + yb * src_step_y_f32;
        __global float *weight_row = hog_descriptor + yb * NUM_BINS_PER_DESCRIPTOR_X;

        int xb = 0;

        // Strict '<': a trailing group of exactly 8 bins is left to the scalar loop below
        while(xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8)
        {
            // Load 8 feature values and the 8 matching weights
            const float4 feat_lo   = vload4(0, feat_row + xb + 0);
            const float4 feat_hi   = vload4(0, feat_row + xb + 4);
            const float4 weight_lo = vload4(0, weight_row + xb + 0);
            const float4 weight_hi = vload4(0, weight_row + xb + 4);

            // Multiply accumulate
            score_f32 += feat_lo * weight_lo;
            score_f32 += feat_hi * weight_hi;

            xb += 8;
        }

        while(xb < NUM_BINS_PER_DESCRIPTOR_X)
        {
            const float feat   = feat_row[xb];
            const float weight = weight_row[xb];

            score += feat * weight;

            ++xb;
        }
    }

    // Fold the four vector lanes into the scalar score
    score += dot(score_f32, (float4)1.0f);

    // Add the bias. The bias is located at the position (descriptor_size() - 1)
    // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
    score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];

    // Not a detection unless the score is strictly above the threshold
    if(!(score > (float)THRESHOLD))
    {
        return;
    }

    // Reserve an output slot. The counter is incremented even when the slot turns out
    // to be beyond the end of the array, in which case nothing is written.
    int id = atomic_inc(num_detection_windows);
    if(!(id < MAX_NUM_DETECTION_WINDOWS))
    {
        return;
    }

    __global DetectionWindow *win = dst + id;

    win->x         = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
    win->y         = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
    win->width     = DETECTION_WINDOW_WIDTH;
    win->height    = DETECTION_WINDOW_HEIGHT;
    win->idx_class = IDX_CLASS;
    win->score     = score;
}

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

455

#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&

John Richardson