/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"

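/* Example build options (illustrative only; the host derives the actual values
 * from the input quantization info and beta, none of them are fixed here):
 *
 *   -DVECTOR_SIZE=8 -DLOG_VECTOR_SIZE=3   // LOG_VECTOR_SIZE must also be supplied
 *                                         // (equal to log2(VECTOR_SIZE)) whenever
 *                                         // VECTOR_SIZE is not the default 16
 *   -DSCALED_DIFF_INT_BITS=5              // integer bits of the scaled differences
 *   -DEXP_ACCUMULATION_INT_BITS=12        // integer bits of the exp accumulator
 *   -DDIFF_MIN=-128                       // differences below this contribute zero
 *   -DNON_MULTIPLE_OF_VECTOR_SIZE         // only when width % VECTOR_SIZE != 0
 *   -DINPUT_BETA_MULTIPLIER=1073741824 -DINPUT_BETA_LEFT_SHIFT=1   // only when beta != 1.0
 */
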
#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */
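/* GRID_SIZE work-items are launched along dimension 0 by the parallel kernel
 * below; its local-memory reduction trees appear to assume GRID_SIZE is a
 * power of two no larger than 256. */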

#if VECTOR_SIZE == 2
__constant uint2 idx__ = (uint2)(0, 1);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)

#elif VECTOR_SIZE == 4
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)

#elif VECTOR_SIZE == 8
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)

#endif /* VECTOR_SIZE END */

#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)

#if defined(DIFF_MIN)

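/** Scales the input differences by the quantized beta value.
 *
 * When beta != 1.0 the host encodes it as a fixed-point multiplier
 * (INPUT_BETA_MULTIPLIER) plus a left shift (INPUT_BETA_LEFT_SHIFT), and the
 * scaling becomes asymm_mult(data << shift, multiplier), i.e. approximately
 * data * multiplier * 2^(shift - 31). Otherwise the data is returned unchanged.
 */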
VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

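/** int4 variant of the beta scaling above, used by the work-group-parallel
 * kernel where each work-item processes 4 elements at a time.
 */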
int4 mult_by_quantized_multiplier_parallel(int4 data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

/** Identifies the maximum value across each row of the input tensor and shifts the input values by this maximum,
 * then computes the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                             Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                            Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                       Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                         maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y                       Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                         maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z                       Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                         maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes  The offset of the first element in the max values tensor
 * @param[out] dst_ptr                             Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[out] sum_ptr                             Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                        Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                          sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                        Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                          sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                        Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                          sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes   The offset of the first element in the sum values tensor
 * @param[in]  width                               Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    VEC_UCHAR max_val_vec = 0;

    // Calculate max of row
    const uint width4 = width >> LOG_VECTOR_SIZE;
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        max_val_vec    = MAX_OP(data, max_val_vec, uchar, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle the leftover elements when width is not a multiple of VECTOR_SIZE
    VEC_UCHAR uchar_min = (VEC_UCHAR)0;
    VEC_UCHAR data      = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_UCHAR widx      = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
    max_val_vec         = MAX_OP(max_val_vec, select(uchar_min, data, widx), uchar, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform max reduction
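    // The vector is folded in half repeatedly until a single lane remains,
    // e.g. for VECTOR_SIZE == 16: 16 -> 8 -> 4 -> 2 -> 1 lanes.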
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, uchar, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, uchar, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);

    // Store result
    *((__global uchar *)maxo.ptr) = max_val_vec.s0;

    // Second part: shift the values, exponentiate and accumulate the sum

    // Load max value of 1D logits vector (row)
    int max_val = convert_int(*((__global uchar *)offset(&maxo, 0, 0)));

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;

    // Shift values, exp and sum
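    // Fixed-point details (gemmlowp-style arithmetic; see helpers_asymm.h):
    //  - data_diff is <= 0; after the optional beta scaling it is treated as a
    //    Q(SCALED_DIFF_INT_BITS) fixed-point value.
    //  - asymm_exp_on_negative_values returns exp(diff) in (0, 1] as a Q0 value.
    //  - asymm_rescale moves it to Q(EXP_ACCUMULATION_INT_BITS) so a whole row
    //    can be accumulated without overflowing 32 bits.
    //  - Note that dst receives the raw differences, not the exponentials; the
    //    norm kernel below recomputes exp from them.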
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data           = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        VEC_INT   data_fp        = CONVERT(data, VEC_INT);
        VEC_INT   data_diff      = data_fp - max_val;
        VEC_INT   data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
        data_fp                  = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
        data_fp                  = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle the leftover elements when width is not a multiple of VECTOR_SIZE
    data                     = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_INT data_fp          = CONVERT(data, VEC_INT);
    VEC_INT data_diff        = data_fp - max_val;
    VEC_INT data_diff_mult   = mult_by_quantized_multiplier_serial(data_diff);
    data_fp                  = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
    data_fp                  = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
    VEC_INT widx_            = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
    VSTORE(VECTOR_SIZE)
    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D   = sum1D + select(0, data_fp, widx_);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, int, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, int, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, int, 1);

    // Calculate and store result
    *((__global int *)sum.ptr) = sum1D.s0;
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                             Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                            Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                       Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                         maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y                       Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                         maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z                       Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                         maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes  The offset of the first element in the max values tensor
 * @param[out] dst_ptr                             Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[out] sum_ptr                             Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                        Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                          sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                        Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                          sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                        Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                          sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes   The offset of the first element in the sum values tensor
 * @param[in]  width                               Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint4 idx4 = (uint4)(0, 1, 2, 3);
    const uint  lid  = get_local_id(0);

    // Define one temporary vector per work-item.
    __local int4  tmp_local[GRID_SIZE];
    __local uchar max_local;

    uchar4 uchar_min   = (uchar4)0;
    uchar4 max_val_vec = uchar_min;

    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;
    // Calculate max of row
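    // Each work-item reads 4 elements at a time; its base offset comes from the
    // image struct set up above (global id times src_step_x), and consecutive
    // iterations stride by GRID_SIZE * 4 elements, so the work-group covers the
    // row in interleaved 4-element chunks.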
    uint i = 0;
    for(; i < width_; i++)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // Number of work-items needed to process the leftover elements.
    // TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle the leftover elements when width is not a multiple of 4
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        uchar4 widx     = convert_uchar4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        max_val_vec     = MAX_OP(max_val_vec, select(uchar_min, data_max, widx), uchar, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = convert_int4(max_val_vec);

    barrier(CLK_LOCAL_MEM_FENCE);

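    // Binary-tree reduction in local memory: at every step the lower half of
    // the active work-items folds in the partial maxima of the upper half,
    // halving the active count until work-item 0 holds the row maximum.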
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec     = MAX_OP(convert_uchar4(tmp_local[lid + 1]), convert_uchar4(tmp_local[lid]), uchar, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
        max_local       = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    int4 sum1D   = 0;
    int  max_val = convert_int(max_local);

    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        uchar4 data           = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4   data_fp        = convert_int4(data);
        int4   data_diff      = data_fp - max_val;
        int4   data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp               = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp               = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // TODO: Optimize this calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data           = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4   data_fp        = convert_int4(data);
        int4   data_diff      = data_fp - max_val;
        int4   data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp               = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp               = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle the leftover elements when width is not a multiple of the vector size:
        // load 4 elements ahead of this work-item's last full load ((GRID_SIZE * i * 4) + 4)
        uchar4 data           = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
        int4   data_fp        = convert_int4(data);
        int4   data_diff      = data_fp - max_val;
        int4   data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp               = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp               = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        int4 widx             = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        data_fp               = select(0, data_fp, widx);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

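    // Tree-reduce the partial sums in local memory, mirroring the max
    // reduction in the first section.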
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
        // Perform the final sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
        sum1D.s0  = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
        *((__global int *)sum.ptr) = sum1D.s0;
    }
}

/** Divides all the values of the input tensor by the sum calculated by the softmax_layer_max_shift_exp_sum_quantized kernels.
 *
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: QASYMM8
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum value of the 1D logits vector (row)
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

    // It would be better to calculate this in the previous layer and pass it in as a parameter
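    // Fixed-point reciprocal of the row sum (sum is in Q(EXP_ACCUMULATION_INT_BITS)):
    //  - clz gives the headroom; shifting left by it normalises the sum into
    //    [1, 2), so it can be written as 1 + x with x in [0, 1).
    //  - shifted_sum_minus_one is that x as a Q0 value (the implicit leading
    //    one, 1u << 31, is subtracted).
    //  - ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1 evaluates 1 / (1 + x) with
    //    Newton-Raphson iterations in fixed point.
    //  - num_bits_over_unit tracks how far the binary point moved; the final
    //    ASYMM_ROUNDING_DIVIDE_BY_POW2 below compensates for it while rescaling
    //    the result to 8 bits.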
    uint  sum_val_u               = convert_uint(sum_val);
    int   headroom_plus_one       = clz(sum_val_u);
    int   num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int   shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    int16 shifted_sum_minus_one   = shifted_sum_minus_one_1;
    int16 shifted_scale           = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);

    // This was already calculated in the previous layer; it should be stored in a temporary output and reused
    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
    int16 data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);

    data = ASYMM_MULT(shifted_scale, data, 16);
    data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
    data = select(0, data, data_diff >= (int16)(DIFF_MIN));
    vstore16(convert_uchar16_sat(data), 0, (__global uchar *)offset(&dst, 0, 0));
}

#endif /* defined(DIFF_MIN) */