/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"

#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
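// Note: the type and size arguments of MAX_OP/ADD_OP/SUB_OP are unused here; they appear
// to be kept only so call sites stay uniform with the other softmax kernels.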

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

#if VECTOR_SIZE == 2
__constant uint2 idx__ = (uint2)(0, 1);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)

#elif VECTOR_SIZE == 4
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)

#elif VECTOR_SIZE == 8
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)

#endif /* VECTOR_SIZE END */
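// Note: LOG_VECTOR_SIZE (log2 of VECTOR_SIZE, used for the shifts below) is only defined
// here for the default 16-wide path; for VECTOR_SIZE 2/4/8 it is presumably supplied at
// compile time alongside -DVECTOR_SIZE.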

#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
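// With the default VECTOR_SIZE of 16, these expand to uchar16, uint16 and int16 respectively.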

#if defined(DIFF_MIN)

VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

int4 mult_by_quantized_multiplier_parallel(int4 data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}
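
/* Both helpers apply the quantized beta as a fixed-point multiplication: ASYMM_MULT(a, b)
 * computes approximately round(a * b / 2^31), so the result is roughly
 * data * 2^INPUT_BETA_LEFT_SHIFT * INPUT_BETA_MULTIPLIER / 2^31.
 *
 * An illustrative example (the encoding is an assumption based on the usual gemmlowp-style
 * scheme, not taken from the host code): beta = 1.5 could be passed as
 *
 *     INPUT_BETA_LEFT_SHIFT = 1
 *     INPUT_BETA_MULTIPLIER = 1610612736   // ~0.75 * 2^31
 *     // result ~= data * 2 * 0.75 = 1.5 * data
 */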

/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
 * then takes the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 * @param[in]  width                              Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    VEC_UCHAR max_val_vec = 0;

    // Calculate max of row
    const uint width4 = width >> LOG_VECTOR_SIZE;
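    // width4 is the number of complete VECTOR_SIZE-wide chunks in the row; the tail is
    // handled in the NON_MULTIPLE_OF_VECTOR_SIZE block below.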
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        max_val_vec    = MAX_OP(data, max_val_vec, uchar, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle non-multiple of VECTOR_SIZE: mask out the lanes that fall beyond the row
    VEC_UCHAR uchar_min = (VEC_UCHAR)0;
    VEC_UCHAR data      = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_UCHAR widx      = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
    max_val_vec         = MAX_OP(max_val_vec, select(uchar_min, data, widx), uchar, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform max reduction
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, uchar, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, uchar, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);

    // Store result
    *((__global uchar *)maxo.ptr) = max_val_vec.s0;

    // Second part

    // Load max value of 1D logits vector (row)
    int max_val = convert_int(*((__global uchar *)offset(&maxo, 0, 0)));

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;
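    // Each exp() result below lies in (0, 1] and is produced in Q0 fixed point; rescaling it
    // to Q(EXP_ACCUMULATION_INT_BITS) frees integer headroom so that up to
    // 2^EXP_ACCUMULATION_INT_BITS such terms can be accumulated in sum1D without overflow.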

    // Shift values, exp and sum
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data         = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle non-multiple of VECTOR_SIZE: mask out-of-row lanes before accumulating
    data                   = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_INT data_fp        = CONVERT(data, VEC_INT);
    VEC_INT data_diff      = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
    VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
    VSTORE(VECTOR_SIZE)
    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D   = sum1D + select(0, data_fp, widx_);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, int, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, int, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, int, 1);

    // Calculate and store result
    *((__global int *)sum.ptr) = sum1D.s0;
}
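
/* A sketch of the compile-time options the kernels in this file expect. The host picks
 * the actual values at configure time, so the numbers below are illustrative assumptions,
 * not values taken from the library:
 *
 *     // Host side (plain C):
 *     const char *build_opts =
 *         " -DDIFF_MIN=-128"                 // hypothetical threshold on (value - max)
 *         " -DSCALED_DIFF_INT_BITS=5"        // fixed-point format of the scaled diff
 *         " -DEXP_ACCUMULATION_INT_BITS=12"  // integer headroom for the exp sum
 *         " -DINPUT_BETA_MULTIPLIER=1610612736 -DINPUT_BETA_LEFT_SHIFT=1"; // beta ~1.5
 *     clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
 */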

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then takes the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 * @param[in]  width                              Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint4 idx4 = (uint4)(0, 1, 2, 3);
    const uint  lid  = get_local_id(0);

    // Define one temporary vector per work-item.
    __local int4 tmp_local[GRID_SIZE];
    __local uchar max_local;

    uchar4 uchar_min   = (uchar4)0;
    uchar4 max_val_vec = uchar_min;

    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;
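    // The GRID_SIZE work-items walk the row cooperatively: each iteration loads a 4-element
    // chunk GRID_SIZE * 4 elements further on, and each work-item's base pointer already
    // includes its own offset (set up by CONVERT_TENSOR3D_TO_IMAGE_STRUCT from the host-side
    // step configuration), so the chunks of neighbouring work-items interleave.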
    // Calculate max of row
    uint i = 0;
    for(; i < width_; i++)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non-multiple of 4: mask out the lanes that fall beyond the row
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        uchar4 widx     = convert_uchar4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        max_val_vec     = MAX_OP(max_val_vec, select(uchar_min, data_max, widx), uchar, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = convert_int4(max_val_vec);

    barrier(CLK_LOCAL_MEM_FENCE);
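    // Tree reduction in local memory: at each step the active half of the work-items folds
    // the upper half of tmp_local into the lower half, halving the span until work-item 0
    // holds the partial maxima.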

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec     = MAX_OP(convert_uchar4(tmp_local[lid + 1]), convert_uchar4(tmp_local[lid]), uchar, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
        max_local       = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    int4 sum1D  = 0;
    int max_val = convert_int(max_local);

    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize this calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non-multiple of vector size: load the chunk 4 elements past the last full
        // GRID_SIZE stride and mask out the lanes that fall beyond the row
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        int4 widx           = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        data_fp             = select(0, data_fp, widx);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);
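    // Same local-memory tree reduction as for the maximum, now summing the per-work-item
    // partial sums.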

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
        // Perform sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
        sum1D.s0  = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
        *((__global int *)sum.ptr) = sum1D.s0;
    }
}

/** Divides all the values of the input tensor by the sum calculated from the softmax_layer_max_shift_exp_sum_quantized kernels.
 *
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum of the row (1D logits vector)
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

    // It would be better to calculate this in the previous kernel and pass it here as a parameter
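    // Fixed-point reciprocal of the sum (non-log path below): the sum is normalised into
    // [1, 2) by shifting its top bit up (clz gives the shift amount), the fractional part
    // feeds ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1 to get 1 / (1 + x) in Q0.31, and
    // num_bits_over_unit records the powers of two factored out so the final
    // divide-by-pow2 can reapply them while scaling into the 8-bit output range.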
#ifndef LOG_SOFTMAX
    uint sum_val_u              = convert_uint(sum_val);
    int headroom_plus_one       = clz(sum_val_u);
    int num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    int16 shifted_sum_minus_one = shifted_sum_minus_one_1;
    int16 shifted_scale         = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);
#endif /* LOG_SOFTMAX */

    // This was already calculated in the previous kernel; ideally it would be stored in a temporary output and reused
    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
    int16 data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */

#ifdef LOG_SOFTMAX
    long16 data = SUB_OP(convert_long16(data_diff_mult), (long16)(sum_val), long, 16);
    data        = select(0L, data, convert_long16(data_diff) >= (long16)(DIFF_MIN));
#else  /* LOG_SOFTMAX */
    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
    data       = ASYMM_MULT(shifted_scale, data, 16);
    data       = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
    data       = select(0, data, data_diff >= (int16)(DIFF_MIN));
#endif /* LOG_SOFTMAX */
    vstore16(convert_uchar16_sat(data), 0, (__global uchar *)offset(&dst, 0, 0));
}

#endif /* defined(DIFF_MIN) */