Blame - src/cpu/kernels/softmax/generic/neon/impl.cpp - ml/ComputeLibrary

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

1

/*

Pablo Marquez Tello

7ce8a83

2023-08-31 16:00:50 +0100

[diff] [blame]

2

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

22

* SOFTWARE.

23

*/

24

#include "src/cpu/kernels/softmax/generic/neon/impl.h"

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

25

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

26

#include "support/SaturateCast.h"

27

28

namespace arm_compute

29

{

30

namespace cpu

31

{

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

32

/* Explicit instantiation of neon_logits_1d_max for signed 8-bit asymmetric quantized data.
 * The template definition is not in this translation unit's visible text; presumably it is
 * provided by the included impl.h - confirm. */
template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);

33

/* Explicit instantiation of neon_logits_1d_max for unsigned 8-bit asymmetric quantized data.
 * The template definition is not in this translation unit's visible text; presumably it is
 * provided by the included impl.h - confirm. */
template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);

34

35

template <typename T>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

36

void neon_softmax_logits_1d_quantized(

37

const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

38

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

39

static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

40

"quantized type should be either qasymm8_t or qasymm8_signed_t.");

41

42

const int start_x = in->info()->valid_region().anchor.x();

43

const int input_width = in->info()->valid_region().shape.x();

44

45

const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;

46

const auto scale_beta_vec = vdupq_n_f32(scale_beta);

47

48

Iterator in_it(in, window);

49

Iterator max_it(max, window);

50

Iterator out_it(out, window);

51

constexpr int vec_size = 16;

52

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

53

execute_window_loop(

54

window,

55

[&](const Coordinates &)

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

56

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

57

/* Get pointers */

58

const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;

59

const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;

60

const auto tmp_ptr = reinterpret_cast<float *>(tmp);

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

61

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

62

float sum{};

63

float sum_inversed{};

64

65

/* Compute exponentials and sum */

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

66

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

67

/* Get max value */

68

const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());

69

const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

70

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

71

/* Init sum to zero */

72

float32x4x4_t vec_sum = {

vdupq_n_f32(0.f),

vdupq_n_f32(0.f),

vdupq_n_f32(0.f),

vdupq_n_f32(0.f),

};

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

78

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

79

/* Loop over row and compute exponentials and sum */

80

int x = 0;

81

for (; x <= (input_width - vec_size); x += vec_size)

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

82

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

83

auto vec_elements = wrapper::vloadq(in_ptr + x);

84

vec_elements = wrapper::vqsub(vec_max, vec_elements);

85

auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

86

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

87

if (is_log)

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

88

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

89

vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);

90

vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);

91

vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);

92

vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);

93

vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));

94

vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));

95

vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));

96

vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));

97

}

98

else

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

99

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

100

vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));

101

vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));

102

vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));

103

vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));

104

vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);

105

vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);

106

vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);

107

vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

108

}

109

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

110

vst4q_f32(tmp_ptr + x, vec_elements_flt);

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

111

}

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

112

113

/* Reduce sum */

114

const auto sum_16_byte =

115

vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));

116

auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));

117

sum_res = vpadd_f32(sum_res, sum_res);

118

sum = wrapper::vgetlane(sum_res, 0);

119

120

/* Run remaining elements */

121

for (; x < input_width; ++x)

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

122

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

float element{};

if (is_log)

{

element = (max_val - in_ptr[x]) * scale_beta;

127

sum += std::exp(element);

}

else

{

element = std::exp((max_val - in_ptr[x]) * scale_beta);

sum += element;

}

tmp_ptr[x] = element;

}

if (!is_log)

{

sum_inversed = 256.f / sum;

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

141

}

142

else

143

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

144

sum = std::log(sum);

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

145

}

146

}

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

147

148

/* Normalize exponentials */

149

{

150

constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;

151

/* Loop over row and compute softmax */

152

int x = 0;

153

for (; x <= (input_width - vec_size); x += vec_size)

154

{

155

using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;

156

float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);

157

int_vec_type normalized_value{};

158

if (is_log)

159

{

160

const float32x4x4_t sub = {

161

vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),

162

vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),

163

vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),

164

vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),

165

};

166

normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);

}

else

{

float32x4x4_t mul = {

171

vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),

172

vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),

173

vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),

174

vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),

175

};

176

177

if (is_qasymm8_signed)

178

{

179

const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});

180

mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);

181

mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);

182

mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);

183

mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);

184

}

185

186

normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);

187

}

188

wrapper::vstore(out_ptr + x, normalized_value);

189

}

190

/* Run remaining elements */

191

for (; x < input_width; ++x)

{

if (is_log)

{

out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);

}

else

{

out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -

200

(is_qasymm8_signed ? 128.f : 0));

}

}

}

},

in_it, max_it, out_it);

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

206

}

207

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame]

208

/* Explicit instantiation of the quantized softmax kernel for signed 8-bit asymmetric data,
 * so the template defined in this file is emitted here for that element type. */
template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
                                                                 const ITensor *max,
                                                                 void *const    tmp,
                                                                 ITensor       *out,
                                                                 float          beta,
                                                                 bool           is_log,
                                                                 const Window  &window);

215

/* Explicit instantiation of the quantized softmax kernel for unsigned 8-bit asymmetric data,
 * so the template defined in this file is emitted here for that element type. */
template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
                                                          const ITensor *max,
                                                          void *const    tmp,
                                                          ITensor       *out,
                                                          float          beta,
                                                          bool           is_log,
                                                          const Window  &window);

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

222

} // namespace cpu

223

} // namespace arm_compute