/*
 * Copyright (c) 2021-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H

#include "arm_compute/core/Helpers.h"

#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"

namespace arm_compute
{
namespace cpu
{

#ifdef __aarch64__
namespace
{
// These helper functions are added because vaddv does not exist for fp16,
// and, therefore, is not part of the wrapper::vaddv interface.
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages)
{
    auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a));
    for (int i = 0; i < sum_stages; ++i)
    {
        sum_res = wrapper::vpadd(sum_res, sum_res);
    }
    return wrapper::vgetlane(sum_res, 0);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

inline float wrapper_vaddv(const float32x4_t &a, int sum_stages)
{
    ARM_COMPUTE_UNUSED(sum_stages);
    return wrapper::vaddv(a);
}
} // namespace
#endif // __aarch64__

// The template implementation for float data types is stored in the header file because
// we need all fp16 instantiated code to live in fp16.cpp files.
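//
// neon_softmax_x_float computes (log-)softmax along the innermost (x) axis of every row selected
// by the window, in three passes over the row:
//   1) m   = max_i(x_i)                             (max subtraction for numerical stability)
//   2) e_i = exp(beta * (x_i - m)),  s = sum_i e_i  (intermediates are staged in the output buffer)
//   3) softmax:     out_i = e_i / s
//      log-softmax: out_i = beta * (x_i - m) - log(s)
// Elements that do not fill a whole SIMD vector are handled by the scalar leftover loops.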
template <typename T, bool IS_LOG>
void neon_softmax_x_float(const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window)
{
    ARM_COMPUTE_UNUSED(axis);
    ARM_COMPUTE_UNUSED(tmp);

    const int input_width = in->info()->valid_region().shape.x();

    Iterator in_it(in, window);
    Iterator out_it(out, window);

    /** SIMD vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    constexpr int vec_size = 16 / sizeof(T);

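    // Number of extra pairwise (vpadd/vpmax) stages needed to finish reducing a full vector after
    // the initial high/low fold: log2(vec_size / 2), i.e. 1 for fp32 (4 lanes), 2 for fp16 (8 lanes).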
    const int sum_stages = log2(vec_size >> 1);

    const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});

    execute_window_loop(
        window,
        [&](const Coordinates &)
        {
            /* Get pointers */
            const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr());
            T *out_ptr = reinterpret_cast<T *>(out_it.ptr());

            T max_val;

            /* Compute Max */
            {
                // Init max value
                auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
                int x = 0;

                for (; x <= (input_width - vec_size); x += vec_size)
                {
                    const auto current_value = wrapper::vloadq(in_ptr + x);
                    vec_max = wrapper::vmax(vec_max, current_value);
                }

#ifdef __aarch64__
                max_val = wrapper::vmaxv(vec_max);
#else // __aarch64__
                auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));

                for (int i = 0; i < sum_stages; ++i)
                {
                    carry_max = wrapper::vpmax(carry_max, carry_max);
                }

                max_val = wrapper::vgetlane(carry_max, 0);
#endif // __aarch64__

                // Compute left-over elements
                for (; x < input_width; ++x)
                {
                    max_val = std::max(*(in_ptr + x), max_val);
                }
            } // compute max

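            // For softmax, sum_transformed holds 1 / sum so the final pass is a single multiply;
            // for log-softmax it holds log(sum) so the final pass is a single subtraction.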
            T sum_transformed{};

            /* Compute exponentials and sum */
            {
                /* Get max value */
                const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});

                /* Init sum to zero */
                auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});

                /* Loop over row and compute exponentials and sum */
                int x = 0;
                for (; x <= (input_width - vec_size); x += vec_size)
                {
                    auto vec_elements = wrapper::vloadq(in_ptr + x);
                    vec_elements = wrapper::vsub(vec_elements, vec_max);
                    if (IS_LOG)
                    {
                        vec_elements = wrapper::vmul(vec_elements, beta_vec);
                        vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
                    }
                    else
                    {
                        vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec));
                        vec_sum = wrapper::vadd(vec_sum, vec_elements);
                    }
                    wrapper::vstore(out_ptr + x, vec_elements);
                }

                /* Reduce sum */
                T sum{};
#ifdef __aarch64__
                sum = wrapper_vaddv(vec_sum, sum_stages);
#else // __aarch64__
                auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
                for (int i = 0; i < sum_stages; ++i)
                {
                    sum_res = wrapper::vpadd(sum_res, sum_res);
                }
                sum = wrapper::vgetlane(sum_res, 0);
#endif // __aarch64__

                /* Run remaining elements */
                for (; x < input_width; ++x)
                {
                    T element{};

                    if (IS_LOG)
                    {
                        element = (in_ptr[x] - max_val) * beta;
                        sum += std::exp(element);
                    }
                    else
                    {
                        element = std::exp((in_ptr[x] - max_val) * beta);
                        sum += element;
                    }

                    out_ptr[x] = element;
                }

                if (!IS_LOG)
                {
                    sum_transformed = T(1) / sum;
                }
                else
                {
                    sum_transformed = static_cast<T>(std::log(sum));
                }
            } // Compute exponentials and sum

            /* Normalize exponentials */
            {
                const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{});

                /* Loop over row and compute softmax */
                int x = 0;
                for (; x <= (input_width - vec_size); x += vec_size)
                {
                    const auto vec_in = wrapper::vloadq(out_ptr + x);
                    if (IS_LOG)
                    {
                        wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec));
                    }
                    else
                    {
                        wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec));
                    }
                }

                /* Run remaining elements */
                for (; x < input_width; ++x)
                {
                    if (IS_LOG)
                    {
                        out_ptr[x] = out_ptr[x] - sum_transformed;
                    }
                    else
                    {
                        out_ptr[x] = out_ptr[x] * sum_transformed;
                    }
                }
            } // Normalize exponentials
        },
        in_it, out_it);
}
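
// neon_softmax_non_x_float computes (log-)softmax along an axis other than the innermost (x) one.
// Instead of reducing within a vector, each SIMD lane processes an independent softmax row: the
// kernel loads vec_size adjacent x elements at a time and steps along the softmax axis using the
// tensors' byte strides, so the max / exp-sum / normalize passes run on vec_size rows in parallel.
// When fewer than vec_size x elements remain, the lanes are handled with scalar code.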
template <typename T, bool IS_LOG>
void neon_softmax_non_x_float(
    const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window)
{
    ARM_COMPUTE_UNUSED(tmp);

    Iterator in_it(in, window);
    Iterator out_it(out, window);

    /** SIMD vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
    constexpr int vec_size = 16 / sizeof(T);
    const ITensorInfo *in_info = in->info();
    const ITensorInfo *out_info = out->info();
    const int x_width = in_info->valid_region().shape.x();
    const unsigned int in_axis_stride = in_info->strides_in_bytes()[axis];
    const unsigned int out_axis_stride = out_info->strides_in_bytes()[axis];
    const int axis_width = in_info->dimension(axis);

    execute_window_loop(
        window,
        [&](const Coordinates &winCoords)
        {
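            // Each SIMD lane handles an independent softmax row: lane j corresponds to x position
            // winCoords[0] + j. If the window step would run past the valid x extent, only the
            // remaining lanes are processed, using scalar code.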
            const bool vector_exceeds_bounds = (winCoords[0] + vec_size) > x_width;

            /* Get pointers */
            const uint8_t *in_ptr = in_it.ptr();
            uint8_t *out_ptr = out_it.ptr();

            // Init max value
            auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});

            /* Compute Max */
            {
                if (!vector_exceeds_bounds)
                {
                    int i = 0;
                    for (; i < axis_width; ++i)
                    {
                        const auto current_value =
                            wrapper::vloadq(reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr));
                        vec_max = wrapper::vmax(vec_max, current_value);
                    }
                }
                else
                {
                    int i = 0;
                    for (; i < axis_width; ++i)
                    {
                        const T *const base_ptr_in = reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr);
                        int j = 0;
                        for (; j < (x_width - winCoords[0]); ++j)
                        {
                            const auto current_value = *(base_ptr_in + j);
                            vec_max[j] = std::max(vec_max[j], current_value);
                        }
                    }
                }
            } // compute max

            auto vec_sum_transformed = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});

            auto vec_elements = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
            /* Init sum to zero */
            auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});

            /* Compute exponentials and sum */
            {
                if (!vector_exceeds_bounds)
                {
                    const auto vec_one = wrapper::vdup_n(static_cast<T>(1), ExactTagType{});
                    /* Loop over row and compute exponentials and sum */
                    int i = 0;
                    for (; i < axis_width; ++i)
                    {
                        vec_elements = wrapper::vloadq(reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr));
                        vec_elements = wrapper::vsub(vec_elements, vec_max);
                        if (IS_LOG)
                        {
                            vec_elements = wrapper::vmul(vec_elements, beta_vec);
                            vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
                        }
                        else
                        {
                            vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec));
                            vec_sum = wrapper::vadd(vec_sum, vec_elements);
                        }

                        wrapper::vstore(reinterpret_cast<T *>((i * out_axis_stride) + out_ptr), vec_elements);
                    }

                    if (!IS_LOG)
                    {
                        vec_sum_transformed = wrapper::vdiv(vec_one, vec_sum);
                    }
                    else
                    {
                        vec_sum_transformed = wrapper::vlog(vec_sum);
                    }
                }
                else
                {
                    int i = 0;
                    for (; i < axis_width; ++i)
                    {
                        const T *const base_ptr_in = reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr);
                        T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr);
                        int j = 0;
                        for (; j < (x_width - winCoords[0]); ++j)
                        {
                            vec_elements[j] = *(base_ptr_in + j);
                            vec_elements[j] -= vec_max[j];
                            if (IS_LOG)
                            {
                                vec_elements[j] *= beta;
                                vec_sum[j] += std::exp(vec_elements[j]);
                            }
                            else
                            {
                                vec_elements[j] = std::exp(vec_elements[j] * beta);
                                vec_sum[j] += vec_elements[j];
                            }
                            *(base_ptr_out + j) = vec_elements[j];
                        }
                    }
                    int j = 0;
                    for (; j < (x_width - winCoords[0]); ++j)
                    {
                        if (!IS_LOG)
                        {
                            vec_sum_transformed[j] = 1 / vec_sum[j];
                        }
                        else
                        {
                            vec_sum_transformed[j] = std::log(vec_sum[j]);
                        }
                    }
                }
            } // Compute exponentials and sum

            /* Normalize exponentials */
            {
                if (!vector_exceeds_bounds)
                {
                    /* Loop over row and compute softmax */
                    int i = 0;
                    for (; i < axis_width; ++i)
                    {
                        T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr);
                        auto vec_in = wrapper::vloadq(base_ptr_out);
                        if (IS_LOG)
                        {
                            wrapper::vstore(base_ptr_out, wrapper::vsub(vec_in, vec_sum_transformed));
                        }
                        else
                        {
                            wrapper::vstore(base_ptr_out, wrapper::vmul(vec_in, vec_sum_transformed));
                        }
                    }
                }
                else
                {
                    int i = 0;
                    for (; i < axis_width; ++i)
                    {
                        T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr);
                        int j = 0;
                        for (; j < (x_width - winCoords[0]); ++j)
                        {
                            if (IS_LOG)
                            {
                                *(base_ptr_out + j) -= vec_sum_transformed[j];
                            }
                            else
                            {
                                *(base_ptr_out + j) *= vec_sum_transformed[j];
                            }
                        }
                    }
                }
            } // Normalize exponentials
        },
        in_it, out_it);
}
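
// The quantized variants are only declared here; their definitions are provided outside this
// header (see the note above about keeping instantiated code in per-data-type source files).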
template <typename T, bool IS_LOG>
void neon_softmax_x_quantized(
    const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);

template <typename T, bool IS_LOG>
void neon_softmax_non_x_quantized(
    const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window);
} // namespace cpu
} // namespace arm_compute

#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H