/*
 * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NESymm.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"

#include <arm_neon.h>

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_fp16.h> // needed for float16_t
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

namespace arm_compute
{
namespace
{
const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);

inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(overflow_policy);
    ARM_COMPUTE_UNUSED(rounding_policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16,
                                                         DataType::S32, DataType::F16, DataType::F32);
    if(is_data_type_quantized(input1->data_type()) || is_data_type_quantized(input2->data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
    }

    if(output->total_size() > 0)
    {
        const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                        "Output can only be U8 if both inputs are U8");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
                                        "Output can only be QASYMM8 if both inputs are QASYMM8");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED),
                                        "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                        "Output can only be QSYMM16 if both inputs are QSYMM16");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                        "Output can only be S32 if both inputs are QSYMM16");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);

        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);

        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
        // frexp returns 0.5 as mantissa, which means that the exponent will be in the range -14 <= e <= 1
        // Moreover, it will be non-positive as we deal with 1/2^n for n >= 1
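        // e.g. (illustrative) for scale = 1/8, std::frexp gives mantissa 0.5 and exponent -2, since 1/8 = 0.5 * 2^-2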
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
    }

    return Status{};
}

/** Scales a given vector by 1/255.
 *
 * @note This does not work for all cases, e.g. for a float of 0.49999999999999994 or very large floats.
 *
 * @param in Input vector to scale.
 * @return Scaled output rounded to nearest (round half up).
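 *
 * Example (illustrative): a lane value of 383 maps to 383 * (1/255) = 1.502..., the +0.5 bias
 * gives 2.002..., and the truncating conversion returns 2 (while 382 yields 1.998... -> 1).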
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Scale
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // Round to nearest (round half up)
    // Add +0.5 for all values
    // Afterwards vcvt rounds toward zero
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}

inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val, const UniformQuantizationInfo &info)
{
    int32_t tmp = static_cast<int32_t>(val / info.scale) + info.offset;

    T tmp_qua = static_cast<T>((tmp > SCHAR_MAX) ? SCHAR_MAX : ((tmp < SCHAR_MIN) ? SCHAR_MIN : tmp));
    return tmp_qua;
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val, const UniformQuantizationInfo &info)
{
    int32_t tmp = static_cast<int32_t>(val / info.scale) + info.offset;

    T tmp_qua = static_cast<T>((tmp > UCHAR_MAX) ? UCHAR_MAX : tmp);
    return tmp_qua;
}

template <typename T>
inline float dequantize(const T *input, const UniformQuantizationInfo &info)
{
    return static_cast<float>((*input) - info.offset) * info.scale;
}
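
// Worked example (illustrative values): with scale = 0.5 and offset = 10, quantize<uint8_t>(2.0f, info)
// computes 2.0 / 0.5 + 10 = 14, and dequantize(&q, info) with q = 14 recovers (14 - 10) * 0.5 = 2.0f.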

template <typename T>
void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(T);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

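    // The user-requested scale is folded into the output quantization step: with
    // tmp_qua_info.scale = output_scale / scale, quantizing the raw dequantized product val
    // yields val * scale / output_scale + offset, so no separate multiply by scale is needed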
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<T *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const auto input1_q = wrapper::vloadq(input1_ptr + x);
            const auto input2_q = wrapper::vloadq(input2_ptr + x);

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            // Quantize output
            const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
            wrapper::vstore(output_ptr + x, result);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = dequantize(input1_ptr + x, input1_qua_info);
            float tmp_in2 = dequantize(input2_ptr + x, input2_qua_info);
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize output
            const auto tmp_qua = quantize<T>(tmp_f, tmp_qua_info);
            *(output_ptr + x)  = tmp_qua;
        }
    },
    input1, input2, output);
}

void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize output: lrintf() rounds to nearest, matching the vector quantization path above
            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, output);
}

void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);
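    // Note: validate_arguments() only accepts scale == 1.f for the QSYMM16 x QSYMM16 -> S32 path,
    // so the shift exponent passed in here is always 0 and the parameter is intentionally unused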

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);
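                // A negative per-lane shift count makes vshlq/vqshlq shift right by n,
                // i.e. divide by 2^n (the saturating variant is selected when is_sat is set)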

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(input1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(input1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(input2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}

template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
        }
    };

    return result;
}

template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint32_t mask = (1u << n) - 1;
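                    // Adding (2^n - 1) before the arithmetic shift rounds negative values toward
                    // zero, e.g. for tmp = -5, n = 1: (-5 + 1) >> 1 = -2, matching truncating division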
                    tmp = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

686 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100687 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100688 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
689 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
690 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
691 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
692 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
693
694 // Clear X Dimension on execution window as we handle manually
695 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
696
697 Iterator broadcast_input(broadcast_tensor, broadcast_win);
698 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
699 Iterator output(out, win);
700
701 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100702 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100703 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
704 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
705
706 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
707 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
708 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
709
710 // Compute window_step_x elements per iteration
711 int x = window_start_x;
712 for(; x <= (window_end_x - window_step_x); x += window_step_x)
713 {
714 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
715 auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
716 wrapper::vstore(output_ptr + x, res);
717 }
718
719 // Compute left-over elements
720 for(; x < window_end_x; ++x)
721 {
722 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
723 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
724 }
725 },
726 broadcast_input, non_broadcast_input, output);
727 }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1       = wrapper::vloadq(input1_ptr + x);
                const auto ta2       = wrapper::vloadq(input2_ptr + x);
                const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
                const auto res       = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, output);
    }
}

void c_mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr)
{
    const auto input1 = static_cast<const float *__restrict>(input1_ptr);
    const auto input2 = static_cast<const float *__restrict>(input2_ptr);
    const auto output = static_cast<float *__restrict>(output_ptr);

    const float32x4_t a = wrapper::vloadq(input1);
    float32x4_t       b = wrapper::vloadq(input2);

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

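    // Each float32x4_t holds two complex values interleaved as (re0, im0, re1, im1); per pair the
    // product is (a.re * b.re - a.im * b.im, a.re * b.im + a.im * b.re). tmp0/tmp1 broadcast the real
    // and imaginary lanes of a, and vrev64 plus the sign mask rearrange b so one vmla finishes the sum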
    const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
    const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
    const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
    const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
    const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

    const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
    const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

    float32x4_t res = wrapper::vmul(tmp0, b);

    b = wrapper::vrev64(b);
    b = wrapper::vmul(b, mask);

    res = wrapper::vmla(res, tmp1, b);
    wrapper::vstore(output, res);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const float16x8x2_t ta1 =
            {
                {
                    vld1q_f16(input1_ptr + x),
                    vld1q_f16(input1_ptr + x + 8),
                }
            };
            const float16x8x2_t ta2 =
            {
                {
                    vld1q_f16(input2_ptr + x),
                    vld1q_f16(input2_ptr + x + 8),
                }
            };
            const float16x8_t   scale_vec = vdupq_n_f16(scale);
            const float16x8x2_t result    =
            {
                {
                    vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                    vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                }
            };
            vst1q_f16(output_ptr + x, result.val[0]);
            vst1q_f16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            const auto ta1    = *(input1_ptr + x);
            const auto ta2    = *(input2_ptr + x);
            *(output_ptr + x) = ta1 * ta2 * scale;
        }
    },
    input1, input2, output);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(in2, in1, out, window, n);
}
} // namespace

NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
    : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
{
}

void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(rounding_policy);
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));

    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    set_shape_if_empty(*output, out_shape);

    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate scaling factor
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;

        std::frexp(scale, &exponent);

        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
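        // e.g. scale = 1/4: frexp gives exponent -1 (since 0.25 = 0.5 * 2^-1), so _scale_exponent = |-1 - 1| = 2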
        _scale_exponent = std::abs(exponent - 1);
    }

    const DataType dt_input1 = input1->data_type();
    const DataType dt_input2 = input2->data_type();
    const DataType dt_output = output->data_type();
    const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);

    switch(dt_input1)
    {
        case DataType::QASYMM8:
            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
            {
                _func_quantized = &mul_saturate_quantized_8<uint8_t>;
            }
            break;
        case DataType::QASYMM8_SIGNED:
            if(dt_input2 == DataType::QASYMM8_SIGNED)
            {
                _func_quantized = &mul_saturate_quantized_8<int8_t>;
            }
            break;
        case DataType::QSYMM16:
            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
            {
                _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
            }
            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
            {
                _func_int = &mul_QSYMM16_QSYMM16_S32;
            }
            break;
        case DataType::S16:
            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
                }
            }
            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
                }
            }
            break;
        case DataType::U8:
            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
                }
            }
            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
                }
            }
            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
                }
            }
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func_float = &mul_F16_F16_F16;
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func_float = &mul_F32_F32_F32;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
    }

    // Configure kernel window
    output->set_valid_region(valid_region);
    Window win = calculate_max_window(valid_region, Steps());

    INEKernel::configure(win);
}

Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
                                                 RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));

    return Status{};
}

void NEPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    auto input1 = inputs.at(TensorType::ACL_SRC_0);
    auto input2 = inputs.at(TensorType::ACL_SRC_1);
    auto output = outputs.at(TensorType::ACL_DST);

    if(_func_quantized != nullptr)
    {
        (*_func_quantized)(input1, input2, output, window, _scale);
    }
    else if(_func_int != nullptr)
    {
        (*_func_int)(input1, input2, output, window, _scale_exponent);
    }
    else
    {
        ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
        (*_func_float)(input1, input2, output, window, _scale);
    }
}
namespace
{
constexpr unsigned int num_elems_processed_per_iteration_complex = 2;
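// i.e. each iteration consumes two complex elements (two float channels each), one float32x4_t per
// call to c_mul_F32_F32_F32_n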

Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);

    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    // Validate in case of configured output
    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
    }

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
    auto_init_if_empty(*output, out_info);

    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);

    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);

    bool window_changed = update_window_and_padding(win_input1, input1_access)
                          || update_window_and_padding(win_input2, input2_access)
                          || update_window_and_padding(win, output_access);

    output_access.set_valid_region(win, valid_region);

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
} // namespace

void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));

    // Configure kernel window
    auto win_config = validate_and_configure_window_complex(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

    // Create kernel
    INEKernel::configure(win_config.second);
}

Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);

    return Status{};
}

void NEComplexPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    auto input1 = inputs.at(TensorType::ACL_SRC_0);
    auto input2 = inputs.at(TensorType::ACL_SRC_1);
    auto output = outputs.at(TensorType::ACL_DST);

    Iterator input1_it(input1, window.broadcast_if_dimension_le_one(input1->info()->tensor_shape()));
    Iterator input2_it(input2, window.broadcast_if_dimension_le_one(input2->info()->tensor_shape()));
    Iterator output_it(output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        c_mul_F32_F32_F32_n(input1_it.ptr(), input2_it.ptr(), output_it.ptr());
    },
    input1_it, input2_it, output_it);
}
} // namespace arm_compute