blob: 96e2b15a9c29225ecafd5e09bd09bbb55cf9188b [file] [log] [blame]
Chunosovf450caa2017-11-08 16:09:35 +07001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Chunosovf450caa2017-11-08 16:09:35 +07003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena4402cb92018-02-15 13:37:40 +000024#include "helpers_asymm.h"
Chunosovf450caa2017-11-08 16:09:35 +070025
// Element-wise reduction primitives. The (type, size) arguments are unused in
// this quantized variant but are kept so call sites stay source-compatible
// with the float softmax kernels, which share the same macro signatures.
#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
Chunosovf450caa2017-11-08 16:09:35 +070029
/* Number of workitems in dimension 0. Defaults to 1 (serial kernel) when the
 * host does not pass -DGRID_SIZE for the parallel kernel. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */
Chunosovf450caa2017-11-08 16:09:35 +070034
/* Per-VECTOR_SIZE specialisations: a constant lane-index vector (idx__) used
 * for boundary masking in the NON_MULTIPLE_OF_VECTOR_SIZE paths, plus wrappers
 * around the fixed-point helpers from helpers_asymm.h instantiated for the
 * chosen vector width.
 *
 * NOTE(review): only the default (16) branch defines LOG_VECTOR_SIZE; for
 * VECTOR_SIZE 2/4/8 it is presumably supplied as a -D build option by the
 * host — confirm against the host-side kernel build options. */
#if VECTOR_SIZE == 2
__constant uint2 idx__ = (uint2)(0, 1);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)

#elif VECTOR_SIZE == 4
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)

#elif VECTOR_SIZE == 8
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)

#endif /* VECTOR_SIZE END */
Chunosovf450caa2017-11-08 16:09:35 +070062
/* Vector type aliases of width VECTOR_SIZE. VEC_BASE is the quantized
 * input/output element type (DATA_TYPE is uchar for QASYMM8, char for
 * QASYMM8_SIGNED); VEC_INT holds the widened fixed-point intermediates. */
#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
Chunosovf450caa2017-11-08 16:09:35 +070067
68#if defined(DIFF_MIN)
69
/** Scales the shifted logits by the quantized beta multiplier (VECTOR_SIZE-wide).
 *
 * When -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT are defined and the
 * multiplier is greater than one, the input is left-shifted and then multiplied
 * in fixed-point; otherwise the input is returned unchanged (beta == 1.0).
 */
VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
{
    VEC_INT result = data;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        const VEC_INT shifted = data * (1 << INPUT_BETA_LEFT_SHIFT);
        result                = asymm_mult(shifted, INPUT_BETA_MULTIPLIER);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return result;
}
80
/** Scales the shifted logits by the quantized beta multiplier (int4 variant
 * used by the parallel kernel, which always processes 4 elements at a time).
 *
 * Identity when the beta build options are absent or the multiplier is <= 1.
 */
int4 mult_by_quantized_multiplier_parallel(int4 data)
{
    int4 result = data;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        const int4 shifted = data * (1 << INPUT_BETA_LEFT_SHIFT);
        result             = ASYMM_MULT(shifted, INPUT_BETA_MULTIPLIER, 4);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return result;
}
91
Chunosovf450caa2017-11-08 16:09:35 +070092/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
93 * then gets the exponent of each element as sums all elements across each row.
94 *
Giorgio Arena4402cb92018-02-15 13:37:40 +000095 * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
Chunosovf450caa2017-11-08 16:09:35 +070096 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
97 * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
98 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
100 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
101 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
102 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
103 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
104 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
105 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
106 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
107 * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
108 * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
109 * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
110 * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
111 * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
112 * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
113 * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
114 * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
115 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
116 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
117 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
118 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
119 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
120 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
121 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
122 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
123 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
124 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
125 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
126 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
128 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
129 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
130 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
131 * @param[in] width Input image width
132 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    // One work-item processes a whole row: 3D tensors are viewed as 2D images,
    // each invocation anchored at its own (y, z) row.
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    // Running per-lane maximum, seeded with the smallest representable value.
    VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);

    // Calculate max of row, VECTOR_SIZE elements per iteration.
    const uint width4 = width >> LOG_VECTOR_SIZE;
    for(uint i = 0; i < width4; i++)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        // NOTE: the trailing "16" is ignored by MAX_OP (kept for signature
        // compatibility with the float kernels), so it is harmless even when
        // VECTOR_SIZE != 16.
        max_val_vec = MAX_OP(data, max_val_vec, DATA_TYPE, 16);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle the tail when width is not a multiple of VECTOR_SIZE: load one
    // more (overlapping-range) vector and mask out-of-bounds lanes to
    // MIN_VALUE so they cannot win the max.
    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE data        = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    // widx lane is non-zero iff that lane's element index is still < width.
    VEC_UCHAR widx = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
    max_val_vec    = MAX_OP(max_val_vec, select(vec_min_val, data, widx), DATA_TYPE, 16);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform max reduction: fold the vector in halves down to lane 0.
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);

    // Store the row maximum.
    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;

    // Second part: shift by the max, exponentiate in fixed point, accumulate.

    // Load max value of 1D logits vector (row)
    int max_val = convert_int(*((__global DATA_TYPE *)offset(&maxo, 0, 0)));

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;

    // Shift values, exp and sum
    for(uint i = 0; i < width4; i++)
    {
        VEC_BASE data          = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val; // always <= 0 by construction
        VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
        // The (unscaled) differences are stored to dst; the norm kernel
        // recomputes the exponentials from them.
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        // Lanes whose difference falls below DIFF_MIN contribute MIN_VALUE
        // instead of their exponential.
        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Tail handling, mirroring the loop body with an additional widx_ mask so
    // out-of-bounds lanes contribute MIN_VALUE to the sum.
    data                   = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_INT data_fp        = CONVERT(data, VEC_INT);
    VEC_INT data_diff      = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
    VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
    VSTORE(VECTOR_SIZE)
    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
    data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D   = sum1D + select(MIN_VALUE, data_fp, widx_);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction: same halving scheme as the max reduction.
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);

    // Calculate and store result
    *((__global int *)sum.ptr) = sum1D.s0;
}
230
Giorgio Arena4402cb92018-02-15 13:37:40 +0000231/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
232 * then gets the exponent of each element as sums all elements across each row.
233 *
234 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Giorgio Arena4402cb92018-02-15 13:37:40 +0000235 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
Giorgio Arena4402cb92018-02-15 13:37:40 +0000236 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
Giorgio Arena4402cb92018-02-15 13:37:40 +0000238 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
239 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
240 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
241 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
242 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
243 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
244 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
245 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
246 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
247 * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
248 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
249 * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
250 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
251 * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
252 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
253 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
254 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
255 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
256 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
257 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
258 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
259 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
260 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
261 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
262 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
263 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
264 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
266 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
267 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
268 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
269 * @param[in] width Input image width
270 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    // GRID_SIZE work-items cooperate on one row; each handles an interleaved
    // stripe of 4-element chunks, then the partial results are combined with a
    // local-memory tree reduction.
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint4 idx4 = (uint4)(0, 1, 2, 3);
    const uint lid   = get_local_id(0);

    // Define one temporary vector per work-item, plus the broadcast row max.
    __local int4 tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    VEC_DATA_TYPE(DATA_TYPE, 4)
    vec_min_val = (VEC_DATA_TYPE(DATA_TYPE, 4))(MIN_VALUE);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    max_val_vec = vec_min_val;

    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item (4 elements per iteration).
    const uint width_ = row >> 2;
    // Calculate max of this work-item's stripe. The per-work-item base offset
    // is baked into the Image struct via the kernel's step_x.
    uint i = 0;
    for(; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max    = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        // Low-lid work-items take one extra full 4-element chunk.
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max    = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of 4: the designated work-item loads the final
        // partial chunk and masks out-of-range lanes to MIN_VALUE.
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = vload4(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        VEC_DATA_TYPE(DATA_TYPE, 4)
        widx        = CONVERT((((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width), VEC_DATA_TYPE(DATA_TYPE, 4));
        max_val_vec = MAX_OP(max_val_vec, select(vec_min_val, data_max, widx), DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    // Publish this work-item's partial max (widened to int4) for reduction.
    tmp_local[lid] = convert_int4(max_val_vec);

    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction over local memory. Barriers sit outside the lid checks so
    // every work-item reaches them; each stage halves the active range.
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        // Final fold of the two remaining partials, then lane-wise down to s0,
        // broadcast to the whole work-group through max_local.
        max_val_vec     = MAX_OP(CONVERT((tmp_local[lid + 1]), VEC_DATA_TYPE(DATA_TYPE, 4)), CONVERT((tmp_local[lid]), VEC_DATA_TYPE(DATA_TYPE, 4)), DATA_TYPE, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
        max_local       = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section: shift by the row max, exponentiate, accumulate the sum */

    // Set sum vector
    int4 sum1D  = 0;
    int max_val = convert_int(max_local);

    // Shift values, exp and sum over this work-item's stripe
    for(i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        // Differences are stored to dst for the norm kernel to reuse.
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        // Lanes below the DIFF_MIN threshold contribute MIN_VALUE to the sum.
        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 positions ahead, *4 is due to the stride
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        // Mask out-of-range lanes before thresholding, as in the max pass.
        int4 widx = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        data_fp   = select(MIN_VALUE, data_fp, widx);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    // Publish this work-item's partial sum for the second tree reduction.
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Same tree-reduction structure as above, with ADD instead of MAX.
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
        // Fold the remaining 4 lanes down to a scalar and store it.
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
        sum1D.s0  = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
        *((__global int *)sum.ptr) = sum1D.s0;
    }
}
529
Chunosovf450caa2017-11-08 16:09:35 +0700530/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
531 *
Chunosovf450caa2017-11-08 16:09:35 +0700532 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
533 * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
534 *
535 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32
536 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
537 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
538 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
539 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
540 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
541 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
542 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
543 * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
544 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
545 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
546 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
547 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
548 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
549 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
550 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
551 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8
552 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
553 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
554 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
555 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
556 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
557 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
558 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
559 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the accumulated exponential sum of this row (computed by the
    // max_shift_exp_sum kernel).
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

    // Compute 1/sum in fixed point. The sum is normalised so its leading bit
    // sits at bit 31, then the reciprocal of (1 + x) for x in [0, 1) is
    // evaluated; num_bits_over_unit records the shift needed to undo the
    // normalisation later.
    // It will be better to calculate this in prev layer and pass here as parameter
    uint sum_val_u               = convert_uint(sum_val);
    int headroom_plus_one        = clz(sum_val_u);
    int num_bits_over_unit       = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int shifted_sum_minus_one_1  = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    int16 shifted_sum_minus_one  = shifted_sum_minus_one_1;
    int16 shifted_scale          = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);

    // Reload the stored (value - max) differences and re-apply the beta scale.
    // It was already calculated in prev layer, should be stored into tmp output and reused
    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
    int16 data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */

    // exp(diff) * (1/sum), then shift down to the 8-bit output range.
    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
    data       = ASYMM_MULT(shifted_scale, data, 16);
    data       = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
#ifdef QASYMM8_SIGNED
    // Re-centre onto the signed quantized range by adding MIN_VALUE.
    data = ADD_OP(data, (int16)(MIN_VALUE), int, 16);
#endif /* QASYMM8_SIGNED */
    // Elements filtered out by the DIFF_MIN threshold map to MIN_VALUE.
    data = select(MIN_VALUE, data, data_diff >= (int16)(DIFF_MIN));
    vstore16(CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
}
599
600#endif /* defined(DIFF_MIN) */