Blame - src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl - ml/ComputeLibrary

Sheri Zhang

b18252d

2020-04-07 11:04:57 +0100

[diff] [blame]

1

/*

Adnan AlSinan

7075fe2

2021-07-05 13:12:52 +0100

[diff] [blame]

2

Sheri Zhang

b18252d

2020-04-07 11:04:57 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

22

* SOFTWARE.

23

*/

24

#include "helpers_asymm.h"

25

26

/* Vector-width specific helpers.
 *
 * For the selected VEC_SIZE this section provides:
 *  - multiply_by_quantized_multiplier(input, qmul, shift): forwards to
 *    MULTIPLY_BY_QUANTIZED_MULTIPLIER with the literal vector width as last argument.
 *  - PERFORM_REDUCTION_IMPL(type): emits perform_reduction_<type>(), which adds all
 *    lanes of a VEC_SIZE-wide vector of <type> together and returns the total.
 *
 * Any VEC_SIZE other than 2, 4 or 8 (including an undefined one) selects the 16-wide variant.
 */
#if VEC_SIZE == 2
#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 2)
#define PERFORM_REDUCTION_IMPL(type)                                                   \
    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 2) sum) \
    {                                                                                  \
        return sum.s0 + sum.s1;                                                        \
    }
#elif VEC_SIZE == 4
#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 4)
#define PERFORM_REDUCTION_IMPL(type)                                                   \
    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 4) sum) \
    {                                                                                  \
        const VEC_DATA_TYPE(type, 2) acc2 = sum.s01 + sum.s23;                         \
        return acc2.s0 + acc2.s1;                                                      \
    }
#elif VEC_SIZE == 8
#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 8)
#define PERFORM_REDUCTION_IMPL(type)                                                   \
    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 8) sum) \
    {                                                                                  \
        const VEC_DATA_TYPE(type, 4) acc4 = sum.s0123 + sum.s4567;                     \
        const VEC_DATA_TYPE(type, 2) acc2 = acc4.s01 + acc4.s23;                       \
        return acc2.s0 + acc2.s1;                                                      \
    }
#else /* VEC_SIZE DEFAULT */
#define VEC_SIZE 16
#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 16)
#define PERFORM_REDUCTION_IMPL(type)                                                    \
    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 16) sum) \
    {                                                                                   \
        const VEC_DATA_TYPE(type, 8) acc8 = sum.s01234567 + sum.s89abcdef;              \
        const VEC_DATA_TYPE(type, 4) acc4 = acc8.s0123 + acc8.s4567;                    \
        const VEC_DATA_TYPE(type, 2) acc2 = acc4.s01 + acc4.s23;                        \
        return acc2.s0 + acc2.s1;                                                       \
    }
#endif /* VEC_SIZE END */

66

67

/* Two-level dispatch: PERFORM_REDUCTION lets its arguments be macro-expanded first,
 * then PERFORM_REDUCTION_STR pastes the element type into the helper's name. */
#define PERFORM_REDUCTION_STR(vec, elem_type) perform_reduction_##elem_type(vec)
#define PERFORM_REDUCTION(vec, elem_type) PERFORM_REDUCTION_STR(vec, elem_type)

/* Emit perform_reduction_int() and perform_reduction_long(). */
PERFORM_REDUCTION_IMPL(int)
PERFORM_REDUCTION_IMPL(long)

72

73

/** Compute the quantized multiplier and shift that represent the inverse square root of the input.
 *
 * The estimate is refined with 5 Newton-Raphson iterations carried out in fixed point with 3 integer bits.
 *
 * @param[in] in            Input to use. Values less than or equal to 1 yield the pair (INT_MAX, 0).
 * @param[in] reverse_shift Factor the resulting shift is multiplied by; -1 to reverse the shift direction
 *
 * @return:
 *         .s0    Quantized multiplier for inverse square root
 *         .s1    Shift for inverse square root
 *
 */
inline int2 get_invsqrt_quantized_multiplier_exp(int in, int reverse_shift)
{
    // Inputs up to 1 are answered directly with the largest multiplier and no shift
    if(in <= 1)
    {
        return (int2)(INT_MAX, 0);
    }

    int value = in;
    int shift = 11;

    // Bring the value below 2^29; every division by 4 is compensated by one extra shift step
    while(value >= (1 << 29))
    {
        value /= 4;
        ++shift;
    }

    // Scale the value up by an even number of bits, taken from its leading zero bits
    const unsigned int spare_bits  = clz(value) - 1;
    const unsigned int spare_pairs = spare_bits / 2 - 1;
    shift -= spare_pairs;
    value <<= 2 * spare_pairs;

    // Fixed-point layout used below: 3 integer bits in a 32-bit signed raw value
    const unsigned int integer_bits = 3;
    const unsigned int one_position = sizeof(int) * 8 - 1 - integer_bits;

    // Operands of the iteration: input / 2 (rounded), 1.5, and the initial guess 1
    const int scaled_value = (value >> 1);
    const int half_value   = ASYMM_ROUNDING_DIVIDE_BY_POW2(scaled_value, 1, 1);
    const int three_halves = (0x1 << one_position) + (0x1 << (one_position - 1));
    int       estimate     = 0x1 << one_position;

    // estimate <- 1.5 * estimate - (input / 2) * estimate^3
    const int newton_steps = 5;
    for(int step = 0; step < newton_steps; ++step)
    {
        const int cube = ASYMM_RESCALE(ASYMM_MULT(ASYMM_MULT(estimate, estimate, 1), estimate, 1), 9, integer_bits, 1);
        estimate       = ASYMM_RESCALE(ASYMM_MULT(three_halves, estimate, 1) - ASYMM_MULT(half_value, cube, 1), 6, integer_bits, 1);
    }

    // 1518500250 / 2^31 ~= sqrt(2) / 2
    const int half_sqrt_2 = 1518500250;
    int       multiplier  = ASYMM_MULT(half_sqrt_2, estimate, 1);

    // A negative shift is folded into the multiplier so the returned shift is never negative before reversal
    if(shift < 0)
    {
        multiplier <<= -shift;
        shift = 0;
    }

    return (int2)(multiplier, shift * reverse_shift);
}

#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
/** This function implements QLSTM layer normalization.
 *
 * Each work-item walks one row of WIDTH elements twice: first to accumulate the sum and the sum of squares,
 * then to normalize, scale by the weight, add the bias and requantize every element.
 *
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=short
 * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
 * @attention Output requantization parameters should be passed using the -DOUTPUT_MULTIPLIER and -DOUTPUT_SHIFT compile flags
 * @attention Optionally, the result can be clamped by passing -DMIN_BOUND and/or -DMAX_BOUND
 *
 * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: QSYMM16
 * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
 * @param[in]  weight_ptr                           Pointer to the weight tensor. Supported data type: same as @p input_ptr
 * @param[in]  weight_stride_x                      Stride of the weight tensor in X dimension (in bytes)
 * @param[in]  weight_step_x                        weight_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  weight_offset_first_element_in_bytes The offset of the first element in the weight tensor
 * @param[in]  bias_ptr                             Pointer to the bias tensor. Supported data type: S32
 * @param[in]  bias_stride_x                        Stride of the bias tensor in X dimension (in bytes)
 * @param[in]  bias_step_x                          bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes   The offset of the first element in the biases tensor
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void qlstm_layer_normalization(
    IMAGE_DECLARATION(input),
    VECTOR_DECLARATION(weight),
    VECTOR_DECLARATION(bias),
    IMAGE_DECLARATION(output))
{
    // Get pixels pointer
    Image  input  = CONVERT_TO_IMAGE_STRUCT(input);
    Vector weight = CONVERT_TO_VECTOR_STRUCT(weight);
    Vector bias   = CONVERT_TO_VECTOR_STRUCT(bias);
    Image  output = CONVERT_TO_IMAGE_STRUCT(output);

    VEC_DATA_TYPE(int, VEC_SIZE)
    sum = 0;
    VEC_DATA_TYPE(long, VEC_SIZE)
    sum_sq = 0;

    // Calculate partial sum
    int i = 0;
    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
    {
        // Load data
        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));

        sum += CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE));
        sum_sq += CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)) * CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE));
    }

    // Perform reduction: lane 0 holds the totals from here on
    sum.s0    = PERFORM_REDUCTION(sum, int);
    sum_sq.s0 = PERFORM_REDUCTION(sum_sq, long);

    // Left-overs loop
    for(; i < WIDTH; ++i)
    {
        DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));

        sum.s0 += CONVERT(data, int);
        sum_sq.s0 += CONVERT(data, long) * CONVERT(data, long);
    }

    // temp is 2^20 / WIDTH; mean is the row mean scaled by 2^10, so mean * mean is scaled by 2^20
    int temp = 0x100000 / WIDTH;
    // Widen before scaling: sum.s0 * 1024 can exceed the 32-bit range although the quotient always fits in an int
    int mean = (int)((long)sum.s0 * 1024 / WIDTH);
    int var2 = ((sum_sq.s0 * (long)temp) - ((long)mean * (long)mean)) / 0x100000;
    // .s0 is the multiplier, .s1 the shift (direction reversed by the -1 argument)
    int2 stddev_inv = get_invsqrt_quantized_multiplier_exp(var2, -1);

    // Second pass: normalize, apply weight and bias, requantize and store
    i = 0;
    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
    {
        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
        VEC_DATA_TYPE(int, VEC_SIZE)
        res = CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)) * 1024 - mean;
        res = multiply_by_quantized_multiplier(res, stddev_inv.s0, stddev_inv.s1);
        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        w   = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)vector_offset(&weight, i));
        res = res * CONVERT(w, VEC_DATA_TYPE(int, VEC_SIZE));
        res = res + VLOAD(VEC_SIZE)(0, (__global int *)vector_offset(&bias, i));
        // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
        res = (res + 512) >> 10;
        res = multiply_by_quantized_multiplier(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12);
#if defined(MIN_BOUND)
        res = max(res, (VEC_DATA_TYPE(int, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
        res = min(res, (VEC_DATA_TYPE(int, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)
        VSTORE(VEC_SIZE)
        (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)offset(&output, i, 0));
    }

    // Left-overs loop: same computation, one element at a time
    for(; i < WIDTH; ++i)
    {
        DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
        int       res  = (int)data * 1024 - mean;
        res            = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, stddev_inv.s0, stddev_inv.s1, 1);
        DATA_TYPE w    = *((__global DATA_TYPE *)vector_offset(&weight, i));
        res            = res * (int)w;
        int b          = *((__global int *)vector_offset(&bias, i));
        res            = res + b;
        // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
        res = (res + 512) >> 10;
        res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12, 1);
#if defined(MIN_BOUND)
        res = max(res, MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
        res = min(res, MAX_BOUND);
#endif // defined(MAX_BOUND)
        *((__global DATA_TYPE *)offset(&output, i, 0)) = (DATA_TYPE)res;
    }
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */