/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"

#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
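// Note: the trailing type/size arguments are unused by these macros; they only
// document the operand type and vector width at each call site.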

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

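// VECTOR_SIZE and LOG_VECTOR_SIZE are normally supplied together as build
// options (e.g. -DVECTOR_SIZE=8 -DLOG_VECTOR_SIZE=3); only the 16/4 defaults
// of the fallback branch are defined in this file.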
#if VECTOR_SIZE == 2
__constant uint2 idx__ = (uint2)(0, 1);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)

#elif VECTOR_SIZE == 4
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)

#elif VECTOR_SIZE == 8
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)

#endif /* VECTOR_SIZE END */

#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)

#if defined(DIFF_MIN)

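// Illustrative build options for the kernels below (values here are only an
// example; the host computes the real ones from the quantization info and beta):
//   -DDIFF_MIN=-128 -DSCALED_DIFF_INT_BITS=5 -DEXP_ACCUMULATION_INT_BITS=12
//   -DINPUT_BETA_MULTIPLIER=1073741824 -DINPUT_BETA_LEFT_SHIFT=2

// The two helpers below scale the (negative) difference data by the quantized
// beta: when beta != 1 the host encodes it as a normalized fixed-point multiplier
// plus a left shift, and ASYMM_MULT performs a gemmlowp-style saturating
// rounding doubling high multiply.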
VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

int4 mult_by_quantized_multiplier_parallel(int4 data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                             Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                            Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                       Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                         maxo_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  maxo_stride_y                       Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                         maxo_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  maxo_stride_z                       Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                         maxo_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes  The offset of the first element in the max values tensor
 * @param[out] dst_ptr                             Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[out] sum_ptr                             Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                        Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                          sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                        Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                          sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                        Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                          sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes   The offset of the first element in the sum values tensor
 * @param[in]  width                               Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    VEC_UCHAR max_val_vec = 0;

    // Calculate max of row
    const uint width4 = width >> LOG_VECTOR_SIZE;
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        max_val_vec    = MAX_OP(data, max_val_vec, uchar, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle non multiple of VECTOR_SIZE
    VEC_UCHAR uchar_min = (VEC_UCHAR)0;
    VEC_UCHAR data      = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_UCHAR widx      = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
    max_val_vec         = MAX_OP(max_val_vec, select(uchar_min, data, widx), uchar, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

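    // Pairwise reduction: each step folds the upper half of the vector onto the
    // lower half, so after log2(VECTOR_SIZE) steps .s0 holds the row maximum,
    // e.g. for VECTOR_SIZE == 4: {a,b,c,d} -> {max(a,c), max(b,d)} -> max of all four.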
    // Perform max reduction
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, uchar, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, uchar, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);

    // Store result
    *((__global uchar *)maxo.ptr) = max_val_vec.s0;

    // Second part

    // Load max value of 1D logits vector (row)
    int max_val = convert_int(*((__global uchar *)offset(&maxo, 0, 0)));

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;

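    // For each element: data_diff = x - max <= 0 is scaled by beta, exponentiated
    // in fixed point (ASYMM_EXP_ON_NEGATIVE_VALUES returns a Q0-format value in
    // (0, 1]), rescaled into the Q(EXP_ACCUMULATION_INT_BITS) accumulator format,
    // and accumulated; differences below DIFF_MIN contribute nothing to the sum.
    // The raw differences are stored to dst for the follow-up normalization kernel.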
    // Shift values, exp and sum
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data         = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle non multiple of VECTOR_SIZE
    data                   = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_INT data_fp        = CONVERT(data, VEC_INT);
    VEC_INT data_diff      = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
    VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
    VSTORE(VECTOR_SIZE)
    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D   = sum1D + select(0, data_fp, widx_);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, int, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, int, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, int, 1);

    // Calculate and store result
    *((__global int *)sum.ptr) = sum1D.s0;
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                             Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                            Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                       Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                         maxo_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  maxo_stride_y                       Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                         maxo_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  maxo_stride_z                       Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                         maxo_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes  The offset of the first element in the max values tensor
 * @param[out] dst_ptr                             Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[out] sum_ptr                             Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                        Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                          sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                        Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                          sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                        Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                          sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes   The offset of the first element in the sum values tensor
 * @param[in]  width                               Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint4 idx4 = (uint4)(0, 1, 2, 3);
    const uint lid   = get_local_id(0);

    // Define one temporary vector per work-item.
    __local int4 tmp_local[GRID_SIZE];
    __local uchar max_local;

    uchar4 uchar_min   = (uchar4)0;
    uchar4 max_val_vec = uchar_min;

    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;
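    // Each work-item reads 4 elements at a time; its global id is already folded
    // into src.ptr by CONVERT_TENSOR3D_TO_IMAGE_STRUCT (assuming the host sets
    // src_step_x accordingly), so consecutive iterations stride by GRID_SIZE * 4
    // elements and the work-group covers the whole row between them.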
    // Calculate max of row
    uint i = 0;
    for(; i < width_; i++)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of 4
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        uchar4 widx     = convert_uchar4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        max_val_vec     = MAX_OP(max_val_vec, select(uchar_min, data_max, widx), uchar, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = convert_int4(max_val_vec);

    barrier(CLK_LOCAL_MEM_FENCE);

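    // Classic work-group tree reduction in local memory: at every step the number
    // of active work-items halves and each survivor merges its partner's partial
    // vector, with barriers keeping the local view consistent; after the last
    // step work-item 0 owns the reduced result.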
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec     = MAX_OP(convert_uchar4(tmp_local[lid + 1]), convert_uchar4(tmp_local[lid]), uchar, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
        max_local       = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    int4 sum1D  = 0;
    int max_val = convert_int(max_local);

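    // Same shift/exp/sum scheme as the serial kernel, but on int4 lanes: each
    // work-item accumulates a partial sum1D over its slice of the row, which is
    // combined across the work-group in local memory below.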
    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of vector size: load 4 elements ahead of the last
        // full load ((GRID_SIZE * i * 4) + 4) and mask out the lanes beyond width
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        int4 widx           = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        data_fp             = select(0, data_fp, widx);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
        // Perform sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
        sum1D.s0  = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
        *((__global int *)sum.ptr) = sum1D.s0;
    }
}

/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
 *
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load sum value of 1D logits vector (row)
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

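    // Reciprocal of the sum, computed entirely in integer arithmetic: clz finds
    // the headroom so the sum can be normalized and the reciprocal evaluated as
    // 1/(1 + x) with x in [0, 1), which ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1
    // approximates by Newton-Raphson iteration.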
    // It would be better to calculate this in the previous layer and pass it here as a parameter
    uint sum_val_u              = convert_uint(sum_val);
    int headroom_plus_one       = clz(sum_val_u);
    int num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    int16 shifted_sum_minus_one = shifted_sum_minus_one_1;
    int16 shifted_scale         = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);

    // This was already calculated in the previous layer; it should be stored into a temporary output and reused
    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
    int16 data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);

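    // Multiply each exp value by the reciprocal of the sum, then shift right so a
    // probability of 1.0 maps to 256 (saturated to 255 by the uchar store below);
    // the shift amount folds together the sum's headroom and the 8-bit output
    // scale. Differences below DIFF_MIN are flushed to a probability of 0.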
    data = ASYMM_MULT(shifted_scale, data, 16);
    data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
    data = select(0, data, data_diff >= (int16)(DIFF_MIN));
    vstore16(convert_uchar16_sat(data), 0, (__global uchar *)offset(&dst, 0, 0));
}

#endif /* defined(DIFF_MIN) */