Blame - src/core/CL/cl_kernels/softmax_layer.cl - ml/ComputeLibrary

2017-09-04 18:44:23 +0100

[diff] [blame]

35

*

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

36

* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

37

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

38

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

39

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

40

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

steniu01

2017-07-13 14:24:23 +0100

[diff] [blame]

41

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

42

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

43

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Georgios Pinitas

e5f8fd6

2017-06-23 18:03:44 +0100

[diff] [blame]

44

* @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

45

* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)

46

* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)

47

* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)

48

* @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)

steniu01

2017-07-13 14:24:23 +0100

[diff] [blame]

49

* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)

50

* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

51

* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor

Georgios Pinitas

e5f8fd6

2017-06-23 18:03:44 +0100

[diff] [blame]

52

* @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

53

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

54

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

55

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

56

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

steniu01

2017-07-13 14:24:23 +0100

[diff] [blame]

57

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

58

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

59

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

60

*/

61

__kernel void softmax_layer_norm(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    const size_t gid_x = get_global_id(0);
    const size_t gid_y = get_global_id(1);
    const size_t gid_z = get_global_id(2);

    // Byte offset along X for this work-item. The element index is shifted back by
    // (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE and clamped at 0, so work-item 0
    // always starts at the beginning of the row.
    const int x_offs = max((int)(gid_x * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);

    // Addresses of the vector this work-item reads from / writes to.
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + gid_y * src_stride_y + gid_z * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + gid_y * dst_stride_y + gid_z * dst_stride_z;

    Image sum_img = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum value that belongs to this row (indexed by the Y global id)
    DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum_img, 0, gid_y));

    // Load the input vector. The variable must be called data0: the store macro below is
    // invoked with the basename "data".
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);

#if defined(LOG_SOFTMAX)
    // Log-softmax: subtract the logarithm of the row sum
    data0 -= log(sum_val);
#else  // defined(LOG_SOFTMAX)
    // Softmax: divide by the row sum
    data0 /= sum_val;
#endif // defined(LOG_SOFTMAX)

    // Work-item 0 takes the leftover path of the store macro when VECTOR_SIZE_LEFTOVER != 0
    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && gid_x == 0)
}

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

87

Giorgio Arena

2020-10-26 15:04:08 +0000

[diff] [blame]

88

#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)

89

90

/* Number of workitems in dimension 0. Falls back to 1 when GRID_SIZE has not been defined already. */

91

#if !defined(GRID_SIZE)

92

#define GRID_SIZE 1

93

#endif /* !defined(GRID_SIZE) */

94

95

/* Vector type used by the kernels below for values loaded with VLOAD(VECTOR_SIZE) (element type DATA_TYPE). */
#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

96

/* Type of the lane masks that the kernels below pass as the third argument of select(). */
#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

97

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

98

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,

99

* then computes the exponential of each element and sums all elements across each row.

100

*

Giorgio Arena

2020-10-26 15:04:08 +0000

[diff] [blame]

101

* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float

102

* @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0

103

* @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16

104

* @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

105

* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.

106

* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).

Giorgio Arena

2020-10-26 15:04:08 +0000

[diff] [blame]

107

* @note In case of log softmax, -DLOG_SOFTMAX must be passed.

108

* @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

109

*

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

110

* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

111

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

112

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

113

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

114

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

115

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

116

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

117

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

118

* @param[out] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr

119

* @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)

120

* @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)

121

* @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)

122

* @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)

123

* @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)

124

* @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)

125

* @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor

126

* @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr

127

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

128

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

129

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

130

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

131

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

132

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

133

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

134

* @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr

135

* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)

136

* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)

137

* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)

138

* @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)

139

* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)

140

* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)

141

* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

142

*/

143

__kernel void softmax_layer_max_shift_exp_sum_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    // A single work-item walks one full row twice:
    //   pass 1 - find the row maximum and write it to maxo
    //   pass 2 - shift by the maximum (scaled by BETA if given), write the result to dst
    //            (before exp for LOG_SOFTMAX, after exp otherwise) and accumulate the sum
    //            of the exponentials, which is written to sum.
    const size_t row_y = get_global_id(1);
    const size_t row_z = get_global_id(2);

    // Typed pointers to the first element of the input / output row
    __global DATA_TYPE *row_in  = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + row_y * src_stride_y + row_z * src_stride_z);
    __global DATA_TYPE *row_out = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + row_y * dst_stride_y + row_z * dst_stride_z);

    Image max_img = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum_img = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

#ifdef BETA
    // Scaling factor, broadcast to every lane
    VEC_TYPE beta = (VEC_TYPE)BETA;
#endif /* BETA */

    /* Pass 1: row maximum */

    // Running per-lane maximum, seeded with the smallest representable value
    VEC_TYPE row_max_vec = (VEC_TYPE)(MINVAL);

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Leading partial vector: the lanes rejected by the mask are replaced by MINVAL so that
    // they cannot influence the maximum.
    VEC_TYPE    head      = VLOAD(VECTOR_SIZE)(0, row_in);
    SELECT_TYPE head_mask = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
    row_max_vec           = max(row_max_vec, select((VEC_TYPE)(MINVAL), head, head_mask));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Full vectors start right after the leftover elements
    for(uint col = VECTOR_SIZE_LEFTOVER; col < SRC_WIDTH; col += VECTOR_SIZE)
    {
        VEC_TYPE chunk = VLOAD(VECTOR_SIZE)(0, row_in + col);
        row_max_vec    = max(chunk, row_max_vec);
    }

    // Collapse the lanes into a single scalar and publish it
    DATA_TYPE max_val                    = MAX_REDUCE(row_max_vec, VECTOR_SIZE);
    *((__global DATA_TYPE *)max_img.ptr) = max_val;

    /* Pass 2: shift, exponentiate, accumulate */

    // Per-lane accumulator for the exponentials
    VEC_TYPE exp_sum_vec = 0;

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Reuse the partial vector loaded during pass 1
    head -= max_val;
#ifdef BETA
    head *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
    // Log-softmax stores the shifted value, i.e. before the exponential is taken
    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)(head, 0, row_out);
#endif /* LOG_SOFTMAX */
    // Zero the lanes rejected by the mask so they do not contribute to the sum
    head = select(0, exp(head), head_mask);
#ifndef LOG_SOFTMAX
    // Plain softmax stores the exponential
    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)(head, 0, row_out);
#endif /* !LOG_SOFTMAX */
    exp_sum_vec += head;
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    for(uint col = VECTOR_SIZE_LEFTOVER; col < SRC_WIDTH; col += VECTOR_SIZE)
    {
        VEC_TYPE chunk = VLOAD(VECTOR_SIZE)(0, row_in + col);
        chunk -= max_val;
#ifdef BETA
        chunk *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(VECTOR_SIZE)(chunk, 0, row_out + col);
#endif /* LOG_SOFTMAX */
        chunk = exp(chunk);
#ifndef LOG_SOFTMAX
        VSTORE(VECTOR_SIZE)(chunk, 0, row_out + col);
#endif /* !LOG_SOFTMAX */
        exp_sum_vec += chunk;
    }

    // Collapse the lanes of the accumulator and publish the row sum
    *((__global DATA_TYPE *)sum_img.ptr) = SUM_REDUCE(exp_sum_vec, VECTOR_SIZE);
}

226

227

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,

228

* then computes the exponential of each element and sums all elements across each row.

229

*

Giorgio Arena

2020-10-26 15:04:08 +0000

[diff] [blame]

230

* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float

231

* @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0

232

* @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16

233

* @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

234

* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.

235

* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).

Giorgio Arena

2020-10-26 15:04:08 +0000

[diff] [blame]

236

* @note In case of log softmax, -DLOG_SOFTMAX must be passed.

237

* @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

238

*

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

239

* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

240

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

241

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

242

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

243

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

244

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

245

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

246

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

247

* @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr

248

* @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)

249

* @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)

250

* @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)

251

* @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)

252

* @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)

253

* @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)

254

* @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor

255

* @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr

256

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

257

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

258

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

259

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

260

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

261

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

262

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

263

* @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr

264

* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)

265

* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)

266

* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)

267

* @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)

268

* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)

269

* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)

270

* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor

Chunosov

2017-11-06 22:09:45 +0700

[diff] [blame]

271

*/

272

__kernel void softmax_layer_max_shift_exp_sum_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    // GRID_SIZE work-items cooperate on one row:
    //   1. each work-item scans its share of the row for a per-lane maximum,
    //   2. the partial maxima are combined through local memory into one scalar (max_local),
    //   3. each work-item shifts its share by that maximum (scaled by BETA if given), stores the
    //      result to dst (before exp for LOG_SOFTMAX, after exp otherwise) and sums the exponentials,
    //   4. the partial sums are combined through local memory and work-item 0 writes the row sum.
    // NOTE(review): the reduction stages below halve from 256 down to 2, so they appear to assume
    // GRID_SIZE is a power of two no larger than 256 - confirm against the host-side configuration.
    const uint lid = get_local_id(0);

    // Work-item lid starts VECTOR_SIZE_LEFTOVER elements into the row, at its own vector slot
    const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

#ifdef BETA
    // Initialize beta
    VEC_TYPE beta = (VEC_TYPE)BETA;
#endif /* BETA */

    // Define one temporary vector per work-item.
    __local VEC_TYPE tmp_local[GRID_SIZE];
    // Row maximum shared with every work-item once the first reduction is complete.
    __local DATA_TYPE max_local;

    VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);

    // Number of iterations per work-item.
    const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;

    // Calculate max of row; consecutive iterations of one work-item are GRID_SIZE vectors apart
    uint i = 0;
    for(; i < width; ++i)
    {
        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec       = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items needed to complete the computation.
    // Computed once and reused by the second section below.
    //TODO: Optimize this calculation (avoid %).
    const int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        // i == width here: one extra vector for the first boundary_workitems work-items
        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec       = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Lane mask for the leading partial vector (presumably VEC_OFFS yields the lane indices, making
    // the first VECTOR_SIZE_LEFTOVER lanes true - confirm against the helper definition).
    // Initialised for every work-item so it is never left indeterminate; only work-item 0 uses it.
    const SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
    if(lid == 0)
    {
        // Handle non multiple of vector size: work-item 0 also covers the VECTOR_SIZE_LEFTOVER
        // elements located just before its starting offset. Masked-out lanes become MINVAL.
        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        max_val_vec       = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = max_val_vec;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Combine the partial maxima. GRID_SIZE is a compile-time constant, so every barrier is reached
    // by all work-items or by none.
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        // With GRID_SIZE == 1 (the default) tmp_local has a single element, so the neighbour
        // slot must not be read.
        if(GRID_SIZE >= 2)
        {
            max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
        }
        else
        {
            max_val_vec = tmp_local[lid];
        }
        max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    VEC_TYPE  sum1D   = 0;
    DATA_TYPE max_val = max_local;

    // Shift values, exp and sum
    for(i = 0; i < width; ++i)
    {
        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        data -= max_val;
#ifdef BETA
        data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
        // Log-softmax stores the shifted value, before the exponential
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        data = exp(data);
#else  /* LOG_SOFTMAX */
        data = exp(data);
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
        sum1D += data;
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    if(lid < boundary_workitems)
    {
        // i == width here: the extra vector handled by the first boundary_workitems work-items
        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        data -= max_val;
#ifdef BETA
        data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        data = exp(data);
#else  /* LOG_SOFTMAX */
        data = exp(data);
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
        sum1D += data;
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(lid == 0)
    {
        // Handle non multiple of vector size: work-item 0 also processes the VECTOR_SIZE_LEFTOVER
        // elements located just before its starting offset.
        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        data -= max_val;
#ifdef BETA
        data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
        (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        data = exp(data);
        // Zero the masked-out lanes so they do not contribute to the sum
        data = select(0, data, widx);
#else  /* LOG_SOFTMAX */
        data = exp(data);
        // Zero the masked-out lanes so they do not contribute to the sum
        data = select(0, data, widx);
        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
        (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
        sum1D += data;
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Combine the partial sums, same scheme as the max reduction above
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] += tmp_local[lid + 128];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] += tmp_local[lid + 64];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] += tmp_local[lid + 32];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] += tmp_local[lid + 16];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] += tmp_local[lid + 8];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] += tmp_local[lid + 4];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] += tmp_local[lid + 2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        // Same guard as for the max reduction: slot 1 only exists when GRID_SIZE >= 2
        if(GRID_SIZE >= 2)
        {
            sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
        }
        else
        {
            sum1D = tmp_local[lid];
        }
        // Perform sum reduction
        *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
    }
}

Giorgio Arena