/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"

#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN)

#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
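// For example, with the hypothetical build options -DDATA_TYPE=uchar and -DVECTOR_SIZE=16, VEC_BASE expands
// to uchar16 and VEC_INT to int16, since VEC_DATA_TYPE simply concatenates the type and the vector size.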

/** Divides all the values of the input tensor by the sum calculated by the softmax_layer_max_shift_exp_sum_quantized kernels.
 *
 * @note The data type must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note The vector size must be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note Quantized beta can optionally be passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum of exponentials of the 1D logits vector (row)
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

    // It would be better to calculate this in the previous kernel and pass it here as a parameter
    uint sum_val_u = convert_uint(sum_val);
    int headroom_plus_one = clz(sum_val_u);
    int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1;
    VEC_INT shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE);
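    // The sum is left-shifted until its most significant bit reaches bit 31; read as a Q1.31 value the shifted
    // sum lies in [1, 2), so subtracting one (1u << 31) yields x in [0, 1) in Q0.31. shifted_scale is therefore
    // approximately 1 / (1 + x), the reciprocal of the normalised sum, and the power of two removed by the
    // normalisation is reintroduced by the rounding divide below (num_bits_over_unit + 31 - 8).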

    // This was already calculated in the previous kernel; it could be stored in a temporary output and reused
    VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
    VEC_INT data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
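    // ASYMM_MULT is a saturating rounding doubling high multiply (roughly a * b / 2^31), so the pre-shift by
    // INPUT_BETA_LEFT_SHIFT followed by the multiply applies beta to the difference as a gemmlowp-style
    // fixed-point multiplier.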

    VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
    data = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE);
    data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE);
#ifdef QASYMM8_SIGNED
    data += (VEC_INT)(MIN_VALUE);
#endif /* QASYMM8_SIGNED */
    data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN));
    VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));

    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
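// Example (hypothetical) build options for softmax_layer_norm_quantized, for a QASYMM8 input whose row width
// is 131 with VECTOR_SIZE 16 (so VECTOR_SIZE_LEFTOVER = 131 % 16 = 3); the quantization-related values below
// are illustrative only:
//   -DDATA_TYPE=uchar -DMIN_VALUE=0 -DVECTOR_SIZE=16 -DVECTOR_SIZE_LEFTOVER=3
//   -DSCALED_DIFF_INT_BITS=5 -DEXP_ACCUMULATION_INT_BITS=12 -DDIFF_MIN=-128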

#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)

VEC_INT mult_by_quantized_multiplier(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note The data type must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note The vector size must be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note In case the input is not a multiple of VECTOR_SIZE, -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can optionally be passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);

    // Calculate max of row
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
    VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
    max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
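    // When NON_MULTIPLE_OF_VECTOR_SIZE is defined, the first VECTOR_SIZE_LEFTOVER elements of the row are
    // handled above: a full vector is loaded and the lanes beyond the leftover are replaced with MIN_VALUE via
    // widx, so they cannot influence the row maximum. The loop below then starts at i = VECTOR_SIZE_LEFTOVER
    // and only processes full, VECTOR_SIZE-wide chunks.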

    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        max_val_vec = max(data, max_val_vec);
    }

    // Perform max reduction
    DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    *((__global DATA_TYPE *)maxo.ptr) = max_local;

    // Second part

    // Convert the max value of the 1D logits vector (row) to int
    int max_val = convert_int(max_local);

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;

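    // Each difference (data - max, always <= 0) is scaled by beta where requested, passed to
    // ASYMM_EXP_ON_NEGATIVE_VALUES as a Q(SCALED_DIFF_INT_BITS) fixed-point value, and the resulting
    // exponential (in Q0.31) is rescaled to Q(EXP_ACCUMULATION_INT_BITS) so that roughly
    // 2^EXP_ACCUMULATION_INT_BITS terms can be accumulated in a 32-bit integer without overflow.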
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_INT data_fp = CONVERT(data, VEC_INT);
    VEC_INT data_diff = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
    data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
    data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
    (data_diff, 0, (__global int *)dst_addr);
    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D += select(0, data_fp, widx);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Shift values, exp and sum
    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        VEC_INT data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

    // Perform sum reduction
    *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
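// Example (hypothetical) configuration for the serial variant above, for a QASYMM8_SIGNED input with
// SRC_WIDTH=131 and VECTOR_SIZE=16 (the quantization-related values are illustrative only):
//   -DDATA_TYPE=char -DQASYMM8_SIGNED -DMIN_VALUE=-128 -DSRC_WIDTH=131 -DVECTOR_SIZE=16 -DLOG_VECTOR_SIZE=4
//   -DVECTOR_SIZE_LEFTOVER=3 -DNON_MULTIPLE_OF_VECTOR_SIZE -DSCALED_DIFF_INT_BITS=5 -DEXP_ACCUMULATION_INT_BITS=12 -DDIFF_MIN=-128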

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note The data type must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note The vector size must be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note In case the input is not a multiple of VECTOR_SIZE (2, 4, 8, 16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can optionally be passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    const uint lid = get_local_id(0);
    const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    // Define one temporary vector per work-item.
    __local VEC_INT tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE max_val_vec = vec_min_val;

    // Number of iterations per work-item.
    const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
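    // Each row is processed cooperatively by GRID_SIZE work-items: work-item lid starts at element
    // VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE and strides by GRID_SIZE * VECTOR_SIZE, so consecutive
    // work-items read consecutive VECTOR_SIZE-wide chunks; the per-work-item partial results are then
    // combined through the local-memory reductions below.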
    // Calculate max of row
    uint i = 0;
    for(; i < width; ++i)
    {
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_INT widx;
    if(lid == 0)
    {
        // Handle the leftover elements (input width not a multiple of VECTOR_SIZE)
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
        max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE)));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = CONVERT(max_val_vec, VEC_INT);

    barrier(CLK_LOCAL_MEM_FENCE);

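    // Tree reduction of the per-work-item partial maxima in local memory: each step folds the upper half of
    // the active work-items into the lower half, halving the active count until only tmp_local[0] and
    // tmp_local[1] remain to be combined by work-item 0.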
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE));
        max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    VEC_INT sum1D = 0;
    int max_val = convert_int(max_local);

    // Shift values, exp and sum
    for(i = 0; i < width; ++i)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        VEC_INT data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        VEC_INT data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(lid == 0)
    {
        // Handle the leftover elements at the start of the row (input width not a multiple of VECTOR_SIZE)
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        VEC_INT data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
        (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int)));
        data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
        data_fp = select(0, data_fp, widx);
        sum1D = sum1D + data_fp;
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] += tmp_local[lid + 128];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] += tmp_local[lid + 64];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] += tmp_local[lid + 32];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] += tmp_local[lid + 16];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] += tmp_local[lid + 8];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] += tmp_local[lid + 4];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] += tmp_local[lid + 2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
        // Perform sum reduction
        *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
    }
}
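// Note: in addition to the options documented for softmax_layer_norm_quantized, the preprocessor guards above
// imply that the two exp-sum kernels also need -DSRC_WIDTH and -DLOG_VECTOR_SIZE (log2 of VECTOR_SIZE), and
// that the parallel variant expects -DGRID_SIZE (number of cooperating work-items per row) plus
// -DNON_MULTIPLE_OF_GRID_SIZE when SRC_WIDTH is not a multiple of GRID_SIZE * VECTOR_SIZE.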
#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */