Blame - src/cpu/kernels/elementwise_binary/generic/neon/impl.h - ml/ComputeLibrary

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

/*

*

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

22

* SOFTWARE.

23

*/

24

#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H

25

#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H

26

27

#include "src/core/NEON/NEAsymm.h"

28

29

namespace arm_compute

{

namespace cpu

{

template <ArithmeticOperation op, typename VectorType>

34

typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)

35

{

36

using vec_type = typename VectorType::type;

37

using scalar_type = typename VectorType::scalar_type;

38

using tag_type = typename VectorType::tag_type;

39

40

vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});

41

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

42

switch (op)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

43

{

44

case ArithmeticOperation::MAX:

45

res = wrapper::vmax(a, b);

46

break;

47

case ArithmeticOperation::MIN:

48

res = wrapper::vmin(a, b);

49

break;

50

case ArithmeticOperation::SQUARED_DIFF:

51

{

52

const vec_type tmp = wrapper::vsub(a, b);

53

res = wrapper::vmul(tmp, tmp);

54

break;

55

}

56

case ArithmeticOperation::PRELU:

57

{

58

const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});

59

const vec_type tmp = wrapper::vmul(a, b);

60

const auto gt = wrapper::vcgt(a, zero);

61

62

res = wrapper::vbsl(gt, a, tmp);

break;

}

default:

ARM_COMPUTE_ERROR("NOT_SUPPORTED!");

}

return res;

}

Dana Zlotnik

a538ae5

2022-02-21 13:12:41 +0200

[diff] [blame]

72

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

73

template <ArithmeticOperation op, typename ScalarType, typename VectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

74

typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a,

75

const ScalarType &broadcast_value,

76

const bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

77

{

78

using tag_type = typename VectorType::tag_type;

79

using vec_type = typename VectorType::type;

80

81

vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});

82

return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);

83

}

84

85

template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

void elementwise_op(

const ITensor *in1,

const ITensor *in2,

ITensor *out,

const Window &window,

91

OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),

92

int (*broadcast_func)(

93

int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),

94

int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

95

{

96

// Create input windows

97

Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());

98

Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

99

100

// Clear X Dimension on execution window as we handle manually

101

Window win = window;

102

win.set(Window::DimX, Window::Dimension(0, 1, 1));

103

104

const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);

105

const auto window_start_x = static_cast<int>(window.x().start());

106

const auto window_end_x = static_cast<int>(window.x().end());

107

const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

108

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

109

if (is_broadcast_across_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

110

{

111

const bool is_broadcast_input_2 = input2_win.x().step() == 0;

112

Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;

113

Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;

114

const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;

115

const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

116

117

// Clear X Dimension on execution window as we handle manually

118

non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

119

120

Iterator broadcast_input(broadcast_tensor, broadcast_win);

121

Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);

122

Iterator output(out, win);

123

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

124

execute_window_loop(

125

win,

126

[&](const Coordinates &)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

127

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

128

auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());

129

const auto non_broadcast_input_ptr =

130

reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());

131

const InputScalarType broadcast_value =

132

*reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());

133

134

int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,

135

broadcast_value, output_ptr, !is_broadcast_input_2);

136

for (; x < window_end_x; ++x)

137

{

138

const auto a = *(non_broadcast_input_ptr + x);

139

*(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,

140

!is_broadcast_input_2 ? a : broadcast_value);

141

}

142

},

143

broadcast_input, non_broadcast_input, output);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

}

else

{

// Clear X Dimension on execution window as we handle manually

148

input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

149

input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

150

151

Iterator input1(in1, input1_win);

152

Iterator input2(in2, input2_win);

153

Iterator output(out, win);

154

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

155

execute_window_loop(

156

win,

157

[&](const Coordinates &)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

158

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

159

auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());

160

const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());

161

const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());

162

163

int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);

164

for (; x < window_end_x; ++x)

165

{

166

const auto a = *(input1_ptr + x);

167

const auto b = *(input2_ptr + x);

168

*(output_ptr + x) = (*scalar_func)(a, b);

169

}

170

},

171

input1, input2, output);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

}

}

template <ArithmeticOperation op, typename ScalarType>

176

inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)

177

{

178

auto res = ScalarType(0);

179

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

180

switch (op)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

181

{

182

case ArithmeticOperation::MAX:

183

res = std::max(a, b);

184

break;

185

case ArithmeticOperation::MIN:

186

res = std::min(a, b);

187

break;

188

case ArithmeticOperation::SQUARED_DIFF:

189

{

190

res = (a - b) * (a - b);

191

break;

192

}

193

case ArithmeticOperation::PRELU:

194

{

195

res = (a > 0 ? a : a * b);

196

break;

197

}

198

case ArithmeticOperation::DIV:

199

{

200

res = a / b;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

201

if (std::is_integral<ScalarType>::value)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

202

{

203

res = (b == 0) ? 0 : res;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

204

if (static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

{

--res;

}

}

break;

}

case ArithmeticOperation::POWER:

212

{

213

res = std::pow(a, b);

break;

}

default:

ARM_COMPUTE_ERROR("NOT_SUPPORTED!");

}

return res;

}

template <>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

223

inline int32x4_t

224

elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a,

225

const int32x4_t &b)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

226

{

227

return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));

228

}

229

230

template <>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

231

inline float32x4_t

232

elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,

233

const float32x4_t &b)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

234

{

235

return wrapper::vdiv(a, b);

236

}

237

238

template <>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

239

inline float32x4_t

240

elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,

241

const float32x4_t &b)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

242

{

243

return wrapper::vpow(a, b);

244

}

245

246

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** DIV specialization for 8 x float16 vectors: forwards to wrapper::vdiv. */
template <>
inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(
    const float16x8_t &a, const float16x8_t &b)
{
    const float16x8_t quotient = wrapper::vdiv(a, b);
    return quotient;
}

/** POWER specialization for 8 x float16 vectors: forwards to wrapper::vpow. */
template <>
inline float16x8_t
elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(
    const float16x8_t &a, const float16x8_t &b)
{
    const float16x8_t powered = wrapper::vpow(a, b);
    return powered;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

262

263

template <ArithmeticOperation op, typename ScalarType, typename VectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

264

inline int elementwise_arithm_op_loop(int window_start_x,

265

int window_end_x,

266

int window_step_x,

267

const ScalarType *input1_ptr,

268

const ScalarType *input2_ptr,

269

ScalarType *output_ptr)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

270

{

271

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

272

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

273

{

274

const auto a = wrapper::vloadq(input1_ptr + x);

275

const auto b = wrapper::vloadq(input2_ptr + x);

276

wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));

}

return x;

}

template <ArithmeticOperation op, typename ScalarType, typename VectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

282

inline int elementwise_arithm_op_broadcast_loop(int window_start_x,

283

int window_end_x,

284

int window_step_x,

285

const ScalarType *non_broadcast_input_ptr,

286

const ScalarType &broadcast_value,

287

ScalarType *output_ptr,

288

const bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

289

{

290

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

291

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

292

{

293

const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

294

wrapper::vstore(output_ptr + x,

295

elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

}

return x;

}

template <ArithmeticOperation op, typename VectorType>

301

void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)

302

{

303

using scalar_type = typename VectorType::scalar_type;

304

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

305

elementwise_op<scalar_type, scalar_type, VectorType>(

306

in1, in2, out, window, &elementwise_arithm_op_scalar<op, scalar_type>,

307

&elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,

308

&elementwise_arithm_op_loop<op, scalar_type, VectorType>);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

309

}

310

311

template <ComparisonOperation op, typename InputScalarType>

312

inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)

313

{

314

bool res = false;

315

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

316

switch (op)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

317

{

318

case ComparisonOperation::Equal:

319

res = (a == b);

320

break;

321

case ComparisonOperation::NotEqual:

322

res = (a != b);

323

break;

324

case ComparisonOperation::Greater:

325

res = (a > b);

326

break;

327

case ComparisonOperation::GreaterEqual:

328

res = (a >= b);

329

break;

330

case ComparisonOperation::Less:

331

res = (a < b);

332

break;

333

case ComparisonOperation::LessEqual:

res = (a <= b);

break;

default:

ARM_COMPUTE_ERROR("NOT_SUPPORTED!");

338

}

339

return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);

340

}

341

342

template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>

343

inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)

344

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

345

OutputVectorType res = {0, 0, 0, 0};

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

346

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

347

switch (op)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

348

{

349

case ComparisonOperation::Equal:

350

res = wrapper::vceq(a, b);

351

break;

352

case ComparisonOperation::NotEqual:

353

res = wrapper::vnot(wrapper::vceq(a, b));

354

break;

355

case ComparisonOperation::Greater:

356

res = wrapper::vcgt(a, b);

357

break;

358

case ComparisonOperation::GreaterEqual:

359

res = wrapper::vcge(a, b);

360

break;

361

case ComparisonOperation::Less:

362

res = wrapper::vcgt(b, a);

363

break;

364

case ComparisonOperation::LessEqual:

365

res = wrapper::vcge(b, a);

366

break;

367

default:

368

ARM_COMPUTE_ERROR("NOT_SUPPORTED!");

}

return res;

}

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

375

inline OutputVectorType

376

elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

377

{

378

InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

379

return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a,

380

reorder ? a : broadcast_vector);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

381

}

382

383

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

384

inline int elementwise_comp_op_broadcast_8_loop(int window_start_x,

385

int window_end_x,

386

int window_step_x,

387

const InputScalarType *non_broadcast_input_ptr,

388

const InputScalarType &broadcast_value,

389

uint8_t *output_ptr,

390

const bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

391

{

392

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

393

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

394

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

395

const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(

396

wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

397

wrapper::vstore(output_ptr + x, a);

}

return x;

}

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

403

inline int elementwise_comp_op_broadcast_16_loop(int window_start_x,

404

int window_end_x,

405

int window_step_x,

406

const InputScalarType *non_broadcast_input_ptr,

407

const InputScalarType &broadcast_value,

408

uint8_t *output_ptr,

409

const bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

410

{

411

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

412

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

413

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

414

const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(

415

wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

416

wrapper::vstore(output_ptr + x, wrapper::vmovn(a));

}

return x;

}

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

422

inline int elementwise_comp_op_broadcast_32_loop(int window_start_x,

423

int window_end_x,

424

int window_step_x,

425

const InputScalarType *non_broadcast_input_ptr,

426

const InputScalarType &broadcast_value,

427

uint8_t *output_ptr,

428

const bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

429

{

430

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

431

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

432

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

433

const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(

434

wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);

435

const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(

436

wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

437

wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));

438

}

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

439

if (x <= window_end_x - 4)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

440

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

441

const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(

442

wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);

443

for (int i = 0; i < 4; i++)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

444

{

445

*(output_ptr + x + i) = wrapper::vgetlane(a, i);

}

x = +4;

}

return x;

}

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

453

inline int elementwise_comp_op_8_loop(int window_start_x,

454

int window_end_x,

455

int window_step_x,

456

const InputScalarType *input1_ptr,

457

const InputScalarType *input2_ptr,

458

uint8_t *output_ptr)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

459

{

460

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

461

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

462

{

463

const auto a = wrapper::vloadq(input1_ptr + x);

464

const auto b = wrapper::vloadq(input2_ptr + x);

465

const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);

466

wrapper::vstore(output_ptr + x, res);

}

return x;

}

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

472

inline int elementwise_comp_op_16_loop(int window_start_x,

473

int window_end_x,

474

int window_step_x,

475

const InputScalarType *input1_ptr,

476

const InputScalarType *input2_ptr,

477

uint8_t *output_ptr)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

478

{

479

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

480

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

481

{

482

const auto a = wrapper::vloadq(input1_ptr + x);

483

const auto b = wrapper::vloadq(input2_ptr + x);

484

const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);

485

wrapper::vstore(output_ptr + x, wrapper::vmovn(res));

}

return x;

}

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

491

inline int elementwise_comp_op_32_loop(int window_start_x,

492

int window_end_x,

493

int window_step_x,

494

const InputScalarType *input1_ptr,

495

const InputScalarType *input2_ptr,

496

uint8_t *output_ptr)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

497

{

498

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

499

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

500

{

501

auto a = wrapper::vloadq(input1_ptr + x);

502

auto b = wrapper::vloadq(input2_ptr + x);

503

const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);

504

a = wrapper::vloadq(input1_ptr + x + 4);

505

b = wrapper::vloadq(input2_ptr + x + 4);

506

const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);

507

wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));

508

}

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

509

if (x <= window_end_x - 4)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

510

{

511

const auto a = wrapper::vloadq(input1_ptr + x);

512

const auto b = wrapper::vloadq(input2_ptr + x);

513

const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

514

for (int i = 0; i < 4; i++)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

515

{

516

*(output_ptr + x + i) = wrapper::vgetlane(res, i);

}

x = +4;

}

return x;

}

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

524

void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)

525

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

526

elementwise_op<InputScalarType, uint8_t, InputVectorType>(

527

in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,

528

&elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,

529

&elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

530

}

531

532

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

533

void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)

534

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

535

elementwise_op<InputScalarType, uint8_t, InputVectorType>(

536

in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,

537

&elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,

538

&elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

539

}

540

541

template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>

542

void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)

543

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

544

elementwise_op<InputScalarType, uint8_t, InputVectorType>(

545

in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,

546

&elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,

547

&elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

548

}

549

550

/** Load 16 unsigned 8-bit values and convert them to four float vectors.
 *
 * Each value is widened to 32 bits, @p offset is subtracted and the result is
 * converted to float and multiplied by @p scale.
 *
 * @param[in] input1_ptr Pointer to the 16 values to load.
 * @param[in] offset     Offset subtracted from every widened value.
 * @param[in] scale      Scale multiplied with every converted value.
 *
 * @return The 16 converted values, in input order.
 */
inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
    qasymm8x16_t x = vld1q_u8(input1_ptr);

    // Widen 8 -> 16 bits
    const uint16x8_t lower_u16 = vmovl_u8(vget_low_u8(x));
    const uint16x8_t upper_u16 = vmovl_u8(vget_high_u8(x));

    // Widen 16 -> 32 bits and reinterpret as signed
    const int32x4_t q0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lower_u16)));
    const int32x4_t q1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(lower_u16)));
    const int32x4_t q2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(upper_u16)));
    const int32x4_t q3 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(upper_u16)));

    const float32x4x4_t out = {{
        vmulq_f32(vcvtq_f32_s32(vsubq_s32(q0, offset)), scale),
        vmulq_f32(vcvtq_f32_s32(vsubq_s32(q1, offset)), scale),
        vmulq_f32(vcvtq_f32_s32(vsubq_s32(q2, offset)), scale),
        vmulq_f32(vcvtq_f32_s32(vsubq_s32(q3, offset)), scale),
    }};
    return out;
}

/** Load 16 signed 8-bit values and convert them to four float vectors.
 *
 * Each value is sign-extended to 32 bits, @p offset is subtracted and the result is
 * converted to float and multiplied by @p scale.
 *
 * @param[in] input1_ptr Pointer to the 16 values to load.
 * @param[in] offset     Offset subtracted from every widened value.
 * @param[in] scale      Scale multiplied with every converted value.
 *
 * @return The 16 converted values, in input order.
 */
inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
    qasymm8x16_signed_t x = vld1q_s8(input1_ptr);

    // Widen 8 -> 16 bits
    const int16x8_t lower_s16 = vmovl_s8(vget_low_s8(x));
    const int16x8_t upper_s16 = vmovl_s8(vget_high_s8(x));

    // Widen 16 -> 32 bits
    const int32x4_t q0 = vmovl_s16(vget_low_s16(lower_s16));
    const int32x4_t q1 = vmovl_s16(vget_high_s16(lower_s16));
    const int32x4_t q2 = vmovl_s16(vget_low_s16(upper_s16));
    const int32x4_t q3 = vmovl_s16(vget_high_s16(upper_s16));

    const float32x4x4_t out = {{
        vmulq_f32(vcvtq_f32_s32(vsubq_s32(q0, offset)), scale),
        vmulq_f32(vcvtq_f32_s32(vsubq_s32(q1, offset)), scale),
        vmulq_f32(vcvtq_f32_s32(vsubq_s32(q2, offset)), scale),
        vmulq_f32(vcvtq_f32_s32(vsubq_s32(q3, offset)), scale),
    }};
    return out;
}

inline void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)

583

{

584

const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));

585

const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));

586

vst1q_u8(output_ptr, vcombine_u8(pa, pb));

587

}

588

589

inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)

590

{

591

const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));

592

const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));

593

vst1q_u8(output_ptr, vcombine_u8(pa, pb));

594

}

595

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

596

inline void

597

store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

598

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

599

int32x4x4_t out = {{

600

vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),

601

vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),

602

vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),

603

vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),

604

}};

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

605

store_quantized(output_ptr, out);

606

}

607

608

inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)

609

{

610

const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));

611

const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));

612

vst1q_s8(output_ptr, vcombine_s8(pa, pb));

613

}

614

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

615

inline void store_quantized_signed(int8_t *output_ptr,

616

const float32x4x4_t &rf,

617

const float32x4_t &offset,

618

const float32x4_t &invscale)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

619

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

620

int32x4x4_t out = {{

621

vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),

622

vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),

623

vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),

624

vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),

625

}};

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

626

store_quantized_signed(output_ptr, out);

627

}

628

629

template <ArithmeticOperation op>

630

inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)

631

{

632

return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo);

633

}

634

635

template <ArithmeticOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

636

inline int8_t

637

elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

638

{

639

return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);

640

}

641

642

template <ArithmeticOperation op>

643

float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)

644

{

645

using neon_vector_float = wrapper::traits::neon_vector<float, 4>;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

646

float32x4x4_t out = {{

647

elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),

648

elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),

649

elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),

650

elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),

651

}};

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

return out;

}

template <ComparisonOperation op>

656

inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)

657

{

658

ARM_COMPUTE_UNUSED(qinfo);

659

return elementwise_comp_op_scalar<op>(a, b);

660

}

661

662

template <ComparisonOperation op>

663

inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)

664

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

665

uint32x4x4_t out = {{elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),

666

elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),

667

elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),

668

elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])}};

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

return out;

}

template <ArithmeticOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

673

inline int elementwise_arithm_op_quantized_loop(int window_start_x,

674

int window_end_x,

675

int window_step_x,

676

const uint8_t *input1_ptr,

677

const uint8_t *input2_ptr,

uint8_t *output_ptr,

int32x4_t voffset1,

int32x4_t voffset2,

float32x4_t vscale1,

float32x4_t vscale2,

float32x4_t voffseto,

684

float32x4_t invvscaleo)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

685

{

686

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

687

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

688

{

689

// Get inputs and compute output

690

const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);

691

const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);

692

const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);

693

store_quantized(output_ptr + x, rf, voffseto, invvscaleo);

}

return x;

}

template <ArithmeticOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

699

inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x,

700

int window_end_x,

701

int window_step_x,

702

const int8_t *input1_ptr,

703

const int8_t *input2_ptr,

int8_t *output_ptr,

int32x4_t voffset1,

int32x4_t voffset2,

float32x4_t vscale1,

float32x4_t vscale2,

float32x4_t voffseto,

710

float32x4_t invvscaleo)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

711

{

712

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

713

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

714

{

715

// Get inputs and compute output

716

const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);

717

const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);

718

const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);

719

store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);

}

return x;

}

template <ArithmeticOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

725

inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x,

726

int window_end_x,

727

int window_step_x,

728

const uint8_t *non_broadcast_input_ptr,

729

float32x4x4_t broadcast_vector,

730

uint8_t *output_ptr,

731

int32x4_t voffset_non_broadcast,

732

float32x4_t vscale_non_broadcast,

733

float32x4_t voffseto,

734

float32x4_t invvscaleo,

735

bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

736

{

737

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

738

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

739

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

740

const float32x4x4_t af =

741

load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);

742

const float32x4x4_t rf =

743

elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

744

store_quantized(output_ptr + x, rf, voffseto, invvscaleo);

}

return x;

}

template <ArithmeticOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

749

inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x,

750

int window_end_x,

751

int window_step_x,

752

const int8_t *non_broadcast_input_ptr,

753

float32x4x4_t broadcast_vector,

754

int8_t *output_ptr,

755

int32x4_t voffset_non_broadcast,

756

float32x4_t vscale_non_broadcast,

757

float32x4_t voffseto,

758

float32x4_t invvscaleo,

759

bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

760

{

761

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

762

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

763

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

764

const float32x4x4_t af =

765

load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);

766

const float32x4x4_t rf =

767

elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

768

store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);

}

return x;

}

template <ComparisonOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

774

inline int elementwise_comp_op_quantized_loop(int window_start_x,

775

int window_end_x,

776

int window_step_x,

777

const uint8_t *input1_ptr,

778

const uint8_t *input2_ptr,

uint8_t *output_ptr,

int32x4_t voffset1,

int32x4_t voffset2,

float32x4_t vscale1,

float32x4_t vscale2,

float32x4_t voffseto,

785

float32x4_t invvscaleo)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

786

{

787

ARM_COMPUTE_UNUSED(voffseto, invvscaleo);

788

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

789

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

790

{

791

const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);

792

const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);

793

const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);

794

store_quantized(output_ptr + x, rf);

}

return x;

}

template <ComparisonOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

800

inline int elementwise_comp_op_quantized_signed_loop(int window_start_x,

801

int window_end_x,

802

int window_step_x,

803

const int8_t *input1_ptr,

804

const int8_t *input2_ptr,

uint8_t *output_ptr,

int32x4_t voffset1,

int32x4_t voffset2,

float32x4_t vscale1,

float32x4_t vscale2,

float32x4_t voffseto,

811

float32x4_t invvscaleo)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

812

{

813

ARM_COMPUTE_UNUSED(voffseto, invvscaleo);

814

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

815

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

816

{

817

const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);

818

const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);

819

const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);

820

store_quantized(output_ptr + x, rf);

}

return x;

}

template <ComparisonOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

826

inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x,

827

int window_end_x,

828

int window_step_x,

829

const uint8_t *non_broadcast_input_ptr,

830

float32x4x4_t broadcast_vector,

831

uint8_t *output_ptr,

832

int32x4_t voffset_non_broadcast,

833

float32x4_t vscale_non_broadcast,

834

float32x4_t voffseto,

835

float32x4_t invvscaleo,

836

bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

837

{

838

ARM_COMPUTE_UNUSED(voffseto, invvscaleo);

839

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

840

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

841

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

842

const float32x4x4_t af =

843

load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);

844

const uint32x4x4_t rf =

845

elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

846

store_quantized(output_ptr + x, rf);

}

return x;

}

template <ComparisonOperation op>

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

852

inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x,

853

int window_end_x,

854

int window_step_x,

855

const int8_t *non_broadcast_input_ptr,

856

float32x4x4_t broadcast_vector,

857

uint8_t *output_ptr,

858

int32x4_t voffset_non_broadcast,

859

float32x4_t vscale_non_broadcast,

860

float32x4_t voffseto,

861

float32x4_t invvscaleo,

862

bool reorder)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

863

{

864

ARM_COMPUTE_UNUSED(voffseto, invvscaleo);

865

int x = window_start_x;

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

866

for (; x <= (window_end_x - window_step_x); x += window_step_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

867

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

868

const float32x4x4_t af =

869

load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);

870

const uint32x4x4_t rf =

871

elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

872

store_quantized(output_ptr + x, rf);

}

return x;

}

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

877

inline void elementwise_op_quantized(const ITensor *in1,

878

const ITensor *in2,

879

ITensor *out,

880

const Window &window,

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

881

uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

882

int (*broadcast_func)(int,

int,

int,

const uint8_t *,

float32x4x4_t,

uint8_t *,

int32x4_t,

float32x4_t,

float32x4_t,

float32x4_t,

const bool),

int (*neon_func)(int,

int,

int,

const uint8_t *,

const uint8_t *,

uint8_t *,

int32x4_t,

int32x4_t,

float32x4_t,

float32x4_t,

float32x4_t,

float32x4_t))

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

905

{

906

// Create input windows

907

Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());

908

Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

909

910

// Clear X Dimension on execution window as we handle manually

911

Window win = window;

912

win.set(Window::DimX, Window::Dimension(0, 1, 1));

913

914

const int window_step_x = 16;

915

const auto window_start_x = static_cast<int>(window.x().start());

916

const auto window_end_x = static_cast<int>(window.x().end());

917

const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

918

919

const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();

920

921

// Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero)

922

const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);

923

const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);

924

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

925

if (is_broadcast_across_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

926

{

927

// Select the broadcast input on the X axis

928

const bool is_broadcast_input_2 = input2_win.x().step() == 0;

929

Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;

930

Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;

931

const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;

932

const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

933

934

const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();

935

const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();

936

937

const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);

938

const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);

939

940

// Clear X Dimension on execution window as we handle manually

941

non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

942

943

Iterator broadcast_input(broadcast_tensor, broadcast_win);

944

Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);

945

Iterator output(out, win);

946

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

947

execute_window_loop(

948

win,

949

[&](const Coordinates &)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

950

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

951

const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());

952

const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

953

954

const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());

955

const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);

956

957

int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,

958

broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,

959

voffseto, invvscaleo, !is_broadcast_input_2);

960

for (; x < window_end_x; ++x)

961

{

962

const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);

963

const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);

964

*(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,

965

!is_broadcast_input_2 ? afs : bfs, output_qinfo);

966

}

967

},

968

broadcast_input, non_broadcast_input, output);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

}

else

{

const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();

973

const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();

974

975

// Input1 quantization info

976

const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);

977

const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);

978

979

// Input2 quantization info

980

const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);

981

const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);

982

983

// Clear X Dimension on execution window as we handle manually

984

input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

985

input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

986

987

Iterator input1(in1, input1_win);

988

Iterator input2(in2, input2_win);

989

Iterator output(out, win);

990

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

991

execute_window_loop(

992

win,

993

[&](const Coordinates &)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

994

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

995

const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());

996

const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());

997

const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

998

999

int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,

1000

voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);

1001

for (; x < window_end_x; ++x)

1002

{

1003

const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);

1004

const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);

1005

*(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);

1006

}

1007

},

1008

input1, input2, output);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1009

}

1010

}

1011

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1012

inline void

1013

elementwise_comp_quantized_signed(const ITensor *in1,

1014

const ITensor *in2,

1015

ITensor *out,

1016

const Window &window,

1017

uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),

1018

int (*broadcast_func)(int,

int,

int,

const int8_t *,

float32x4x4_t,

uint8_t *,

int32x4_t,

float32x4_t,

float32x4_t,

float32x4_t,

const bool),

int (*neon_func)(int,

int,

int,

const int8_t *,

const int8_t *,

uint8_t *,

int32x4_t,

int32x4_t,

float32x4_t,

float32x4_t,

float32x4_t,

float32x4_t))

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1041

{

1042

// Create input windows

1043

Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());

1044

Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

1045

1046

// Clear X Dimension on execution window as we handle manually

1047

Window win = window;

1048

win.set(Window::DimX, Window::Dimension(0, 1, 1));

1049

1050

const int window_step_x = 16;

1051

const auto window_start_x = static_cast<int>(window.x().start());

1052

const auto window_end_x = static_cast<int>(window.x().end());

1053

const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

1054

1055

const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();

1056

1057

const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);

1058

const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);

1059

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1060

if (is_broadcast_across_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1061

{

1062

// Select the broadcast input on the X axis

1063

const bool is_broadcast_input_2 = input2_win.x().step() == 0;

1064

Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;

1065

Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;

1066

const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;

1067

const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

1068

1069

const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();

1070

const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();

1071

1072

const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);

1073

const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);

1074

1075

// Clear X Dimension on execution window as we handle manually

1076

non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

1077

1078

Iterator broadcast_input(broadcast_tensor, broadcast_win);

1079

Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);

1080

Iterator output(out, win);

1081

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1082

execute_window_loop(

1083

win,

1084

[&](const Coordinates &)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1085

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1086

const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());

1087

const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

1088

1089

const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());

1090

const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);

1091

1092

int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,

1093

broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,

1094

voffseto, invvscaleo, !is_broadcast_input_2);

1095

for (; x < window_end_x; ++x)

1096

{

1097

const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);

1098

const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);

1099

*(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,

1100

!is_broadcast_input_2 ? afs : bfs, output_qinfo);

1101

}

1102

},

1103

broadcast_input, non_broadcast_input, output);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

}

else

{

const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();

1108

const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();

1109

1110

// Input1 quantization info

1111

const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);

1112

const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);

1113

1114

// Input2 quantization info

1115

const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);

1116

const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);

1117

1118

// Clear X Dimension on execution window as we handle manually

1119

input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

1120

input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

1121

1122

Iterator input1(in1, input1_win);

1123

Iterator input2(in2, input2_win);

1124

Iterator output(out, win);

1125

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1126

execute_window_loop(

1127

win,

1128

[&](const Coordinates &)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1129

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1130

const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());

1131

const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());

1132

const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

1133

1134

int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,

1135

voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);

1136

for (; x < window_end_x; ++x)

1137

{

1138

const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);

1139

const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);

1140

*(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);

1141

}

1142

},

1143

input1, input2, output);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1144

}

1145

}

1146

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1147

inline void

1148

elementwise_op_quantized_signed(const ITensor *in1,

1149

const ITensor *in2,

1150

ITensor *out,

1151

const Window &window,

1152

int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),

1153

int (*broadcast_func)(int,

int,

int,

const int8_t *,

float32x4x4_t,

int8_t *,

int32x4_t,

float32x4_t,

float32x4_t,

float32x4_t,

const bool),

int (*neon_func)(int,

int,

int,

const int8_t *,

const int8_t *,

int8_t *,

int32x4_t,

int32x4_t,

float32x4_t,

float32x4_t,

float32x4_t,

float32x4_t))

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1176

{

1177

// Create input windows

1178

Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());

1179

Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

1180

1181

// Clear X Dimension on execution window as we handle manually

1182

Window win = window;

1183

win.set(Window::DimX, Window::Dimension(0, 1, 1));

1184

1185

const int window_step_x = 16;

1186

const auto window_start_x = static_cast<int>(window.x().start());

1187

const auto window_end_x = static_cast<int>(window.x().end());

1188

const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

1189

1190

const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();

1191

1192

const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);

1193

const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);

1194

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1195

if (is_broadcast_across_x)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1196

{

1197

// Select the broadcast input on the X axis

1198

const bool is_broadcast_input_2 = input2_win.x().step() == 0;

1199

Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;

1200

Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;

1201

const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;

1202

const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

1203

1204

const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();

1205

const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();

1206

1207

const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);

1208

const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);

1209

1210

// Clear X Dimension on execution window as we handle manually

1211

non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

1212

1213

Iterator broadcast_input(broadcast_tensor, broadcast_win);

1214

Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);

1215

Iterator output(out, win);

1216

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1217

execute_window_loop(

1218

win,

1219

[&](const Coordinates &)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1220

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1221

const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());

1222

const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

1223

1224

const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());

1225

const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);

1226

1227

int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,

1228

broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,

1229

voffseto, invvscaleo, !is_broadcast_input_2);

1230

for (; x < window_end_x; ++x)

1231

{

1232

const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);

1233

const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);

1234

*(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,

1235

!is_broadcast_input_2 ? afs : bfs, output_qinfo);

1236

}

1237

},

1238

broadcast_input, non_broadcast_input, output);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

}

else

{

const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();

1243

const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();

1244

1245

// Input1 quantization info

1246

const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);

1247

const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);

1248

1249

// Input2 quantization info

1250

const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);

1251

const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);

1252

1253

// Clear X Dimension on execution window as we handle manually

1254

input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

1255

input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

1256

1257

Iterator input1(in1, input1_win);

1258

Iterator input2(in2, input2_win);

1259

Iterator output(out, win);

1260

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1261

execute_window_loop(

1262

win,

1263

[&](const Coordinates &)

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

1264

{

Felix Thomasmathibalan

afd38f0

2023-09-27 17:46:17 +0100

[diff] [blame^]

1265

const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());

1266

const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());

1267

const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

1268

1269

int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,

1270

voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);

1271

for (; x < window_end_x; ++x)

1272

{

1273

const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);

1274

const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);

1275

*(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);

1276

}

1277

},

1278

input1, input2, output);

Dana Zlotnik

d5c496d

2021-11-28 14:46:12 +0200

[diff] [blame]

}

}

template <ArithmeticOperation op>

1283

void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)

1284

{

1285

elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,

1286

&elementwise_arithm_op_quantized_broadcast_loop<op>,

1287

&elementwise_arithm_op_quantized_loop<op>);

1288

}

1289

1290

template <ArithmeticOperation op>

1291

void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)

1292

{

1293

elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>,

1294

&elementwise_arithm_op_quantized_signed_broadcast_loop<op>,

1295

&elementwise_arithm_op_quantized_singed_loop<op>);

1296

}

1297

1298

template <ComparisonOperation op>

1299

void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)

1300

{

1301

elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,

1302

&elementwise_comp_op_quantized_broadcast_loop<op>,

1303

&elementwise_comp_op_quantized_loop<op>);

1304

}

1305

1306

template <ComparisonOperation op>

1307

void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)

1308

{

1309

elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,

1310

&elementwise_comp_op_quantized_signed_broadcast_loop<op>,

1311

&elementwise_comp_op_quantized_signed_loop<op>);

1312

}

1313

} // namespace cpu

1314

} // namespace arm_compute

1315

1316

#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H */