Blame - src/core/CL/cl_kernels/softmax_layer_quantized.cl - ml/ComputeLibrary

* @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)

109

* @note -DDIFF_MIN must be passed at compile time. It is the threshold on the difference between the currently processed value and the maximum value of the input data: a value whose difference is below DIFF_MIN is not taken into account in the sum.

110

*

111

* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8

112

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

113

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

114

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

115

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

116

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

117

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

118

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

119

* @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr

120

* @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)

121

* @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)

122

* @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)

123

* @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)

124

* @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)

125

* @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)

126

* @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor

127

* @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32

128

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

129

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

130

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

131

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

132

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

133

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

134

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

135

* @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr

136

* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)

137

* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)

138

* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)

139

* @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)

140

* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)

141

* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)

142

* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor

143

* @param[in] width Input image width

144

*/

145

__kernel void softmax_layer_shift_exp_sum_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(max),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    // Load max value of 1D logits vector (row)
    int max_val = convert_int(*((__global uchar *)offset(&max, 0, 0)));

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    int16 sum1D = 0;

    // Shift values, exp and sum.
    // width4 is the number of complete 16-element vectors in the row (width / 16).
    const uint width4 = width >> 4;
    for(uint i = 0; i < width4; i++)
    {
        uchar16 data           = vload16(0, (__global uchar *)offset(&src, i << 4, 0));
        int16   data_fp        = convert_int16(data);
        int16   data_diff      = data_fp - max_val;
        int16   data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
        // The destination receives the raw difference (value - max), not the exponential
        vstore16(data_diff, 0, (__global int *)offset(&dst, i << 4, 0));
        // Only lanes whose difference is at least DIFF_MIN contribute to the sum
        sum1D = sum1D + select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
    }

#ifdef NON_MULTIPLE_OF_16
    // Handle non multiple of 16.
    // A full 16-lane vector is still loaded and stored here; only the accumulation is masked.
    // NOTE(review): presumably the src/dst rows are padded so these accesses stay in bounds - confirm against the host-side configuration.
    uchar16 data           = vload16(0, (__global uchar *)offset(&src, width4 << 4, 0));
    int16   data_fp        = convert_int16(data);
    int16   data_diff      = data_fp - max_val;
    int16   data_diff_mult = mult_by_quantized_multiplier(data_diff);
    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
    // Lane mask: set for lanes whose absolute X index is below width.
    // idx16 is defined outside this kernel (presumably the lane indices 0..15 - confirm).
    int16 widx = convert_int16(((uint16)(width4 << 4) + idx16) < width);
    vstore16(data_diff, 0, (__global int *)offset(&dst, width4 << 4, 0));
    data_fp = select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
    sum1D   = sum1D + select(0, data_fp, widx);
#endif /* NON_MULTIPLE_OF_16 */

    // Perform sum reduction: fold the 16 partial sums in halves (16 -> 8 -> 4 -> 2 -> 1)
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, qs16, 8);
    sum1D.s0123     = ADD_OP(sum1D.s0123, sum1D.s4567, qs16, 4);
    sum1D.s01       = ADD_OP(sum1D.s01, sum1D.s23, qs16, 2);
    sum1D.s0        = ADD_OP(sum1D.s0, sum1D.s1, qs16, 1);

    // Calculate and store result
    *((__global int *)sum.ptr) = sum1D.s0;
}

200

201

/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum_quantized kernel.
 *
 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold on the difference between the currently processed value and the maximum value of the input data: a value whose difference is below DIFF_MIN produces a zero output.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum value of the row handled by this work-item (row index = global id along Y)
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

    // It will be better to calculate this in prev layer and pass here as parameter
    uint sum_val_u          = convert_uint(sum_val);
    int  headroom_plus_one  = clz(sum_val_u);
    int  num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    // Shift the sum left until its leading one reaches bit 31, then subtract that bit
    int   shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    int16 shifted_sum_minus_one   = shifted_sum_minus_one_1;
    int16 shifted_scale           = asymm_one_over_one_plus_x_for_x_in_0_1(shifted_sum_minus_one);

    // It was already calculated in prev layer, should be stored into tmp output and reused
    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
    int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
    int16 data           = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);

    data = asymm_mult(shifted_scale, data);
    // NOTE(review): the "31 - 8" term presumably rescales a 31-fractional-bit value to the 8-bit output range - confirm.
    data = asymm_rounding_divide_by_pow2(data, num_bits_over_unit + 31 - 8);
    // Clamp to [0, 255]; lanes whose difference is below DIFF_MIN are forced to 0
    data = select(0, max(min(data, 255), 0), data_diff >= (int16)(DIFF_MIN));
    vstore16(convert_uchar16(data), 0, (__global uchar *)offset(&dst, 0, 0));
}

262

263

#endif /* defined(DIFF_MIN) */