/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NESymm.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"

#include <arm_neon.h>

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_fp16.h> // needed for float16_t
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

namespace arm_compute
{
namespace
{
const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);

inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(overflow_policy);
    ARM_COMPUTE_UNUSED(rounding_policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16,
                                                         DataType::S32, DataType::F16, DataType::F32);
    if(is_data_type_quantized(input1->data_type()) || is_data_type_quantized(input2->data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
    }

    if(output->total_size() > 0)
    {
        const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                        "Output can only be U8 if both inputs are U8");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
                                        "Output can only be QASYMM8 if both inputs are QASYMM8");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED),
                                        "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                        "Output can only be QSYMM16 if both inputs are QSYMM16");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                        "Output can only be S32 if both inputs are QSYMM16");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);

        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);

        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
        // frexp returns 0.5 as the mantissa, so for scale = 1/2^n the exponent is e = 1 - n, i.e. -14 <= e <= 1
        // Moreover, e is non-positive for every n >= 1 as we deal with 1/2^n
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
    }

    return Status{};
}

/* Scales a given vector by 1/255.
 *
 * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
 *
 * @param in Input vector to scale.
 * @return Scaled output rounded to nearest (round half up).
 */
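// Worked example of the round-half-up trick below: 383 * (1/255) = 1.502f -> +0.5f = 2.002f,
// which vcvt truncates toward zero to 2, while 382 * (1/255) = 1.498f -> 1.998f -> 1.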
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Scale
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // Round to nearest (round half up)
    // Add +0.5 for all values
    // Afterwards vcvt rounds toward zero
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}

inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}

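// Requantization dispatch helpers: T appears only in the return type, so it cannot be deduced
// from the arguments; callers select the signed/unsigned path explicitly as vquantize<T>(...).
// The uint8_t overload forwards to the non-template arm_compute::vquantize from NEAsymm.h.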
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}

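// Saturating QASYMM8/QASYMM8_SIGNED multiplication: inputs are dequantized to float,
// multiplied, then requantized. The user scale is folded into the requantization step by
// dividing the output quantization scale by it (tmp_qua_info below).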
template <typename T>
void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T);
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize output
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     in1     = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize output
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize output
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     in1     = *(input1_ptr + x);
                const T     in2     = *(input2_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, input1_qua_info);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(in2, input2_qua_info);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize output
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        input1, input2, output);
    }
}

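// QSYMM16 * QSYMM16 -> QSYMM16: same dequantize/multiply/requantize scheme as the 8-bit path
// above; QSYMM16 is a symmetric format, so only the scales (no offsets) take part.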
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;
            // Quantize output; lrintf() rounds to nearest, matching the rounding of the vectorized path
            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, output);
}

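// QSYMM16 * QSYMM16 -> S32: validate_arguments() only accepts scale == 1.f for this
// combination, so the parameter is unused and the widened 32-bit products are stored as-is.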
void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, output);
}

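// U8 * U8 -> U8. is_scale255 selects the float-based 1/255 scaling path; otherwise the result
// is right-shifted by n (scale == 1/2^n). is_sat selects saturating shifts and narrowing.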
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(input1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(input1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(input2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
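        // Scale is 1/2^n here: a plain arithmetic right shift rounds toward minus infinity,
        // but the TO_ZERO policy requires truncation toward zero. convert_* below is
        // sign_bit * (2^n - 1), which is added to negative values before the shift.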
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}

template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
        }
    };

    return result;
}

template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

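// F32 * F32 -> F32 with optional broadcasting along x: when one input has a single element in
// x, its value is duplicated into a vector once per row and only the other input is streamed.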
void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(output.ptr());

            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
            const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1       = wrapper::vloadq(input1_ptr + x);
                const auto ta2       = wrapper::vloadq(input2_ptr + x);
                const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
                const auto res       = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, output);
    }
}

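// Multiplies two complex (2-channel F32) elements at once. Each float32x4_t holds two
// interleaved (real, imag) pairs; (a + ib)(c + id) = (ac - bd) + i(ad + bc) is formed from one
// vmul plus one vmla on the element-swapped, sign-flipped second operand.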
void c_mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr)
{
    const auto input1 = static_cast<const float *__restrict>(input1_ptr);
    const auto input2 = static_cast<const float *__restrict>(input2_ptr);
    const auto output = static_cast<float *__restrict>(output_ptr);

    const float32x4_t a = wrapper::vloadq(input1);
    float32x4_t       b = wrapper::vloadq(input2);

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

    const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
    const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
    const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
    const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
    const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

    const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
    const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

    float32x4_t res = wrapper::vmul(tmp0, b);

    b = wrapper::vrev64(b);
    b = wrapper::vmul(b, mask);

    res = wrapper::vmla(res, tmp1, b);
    wrapper::vstore(output, res);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const float16x8x2_t ta1 =
            {
                {
                    vld1q_f16(input1_ptr + x),
                    vld1q_f16(input1_ptr + x + 8),
                }
            };
            const float16x8x2_t ta2 =
            {
                {
                    vld1q_f16(input2_ptr + x),
                    vld1q_f16(input2_ptr + x + 8),
                }
            };
            const float16x8_t   scale_vec = vdupq_n_f16(scale);
            const float16x8x2_t result    =
            {
                {
                    vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                    vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                }
            };
            vst1q_f16(output_ptr + x, result.val[0]);
            vst1q_f16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            const auto ta1    = *(input1_ptr + x);
            const auto ta2    = *(input2_ptr + x);
            *(output_ptr + x) = ta1 * ta2 * scale;
        }
    },
    input1, input2, output);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

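// U8 * U8 -> S16. The product of two U8 values is always non-negative, so saturation only
// needs to clamp against SHRT_MAX (vminq_u16 in the vector loop, the upper check in the tail).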
template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

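// S16 * U8 -> S16: the U8 operand is widened to S16 with vmovl_u8 and the work is delegated
// to the same mul_S16_S16_S16_n_k core used by the S16 * S16 path.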
template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(in2, in1, out, window, n);
}
} // namespace

NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
    : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
{
}

void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(rounding_policy);
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));

    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    set_shape_if_empty(*output, out_shape);

    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate scaling factor
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;

        std::frexp(scale, &exponent);

        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we need to subtract 1 to compensate for frexp returning a mantissa of 0.5
        _scale_exponent = std::abs(exponent - 1);
    }

    const DataType dt_input1 = input1->data_type();
    const DataType dt_input2 = input2->data_type();
    const DataType dt_output = output->data_type();
    const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);

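    // Select the kernel implementation from the (input1, input2, output) data-type
    // combination and the scaling mode.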
    switch(dt_input1)
    {
        case DataType::QASYMM8:
            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
            {
                _func_quantized = &mul_saturate_quantized_8<uint8_t>;
            }
            break;
        case DataType::QASYMM8_SIGNED:
            if(dt_input2 == DataType::QASYMM8_SIGNED)
            {
                _func_quantized = &mul_saturate_quantized_8<int8_t>;
            }
            break;
        case DataType::QSYMM16:
            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
            {
                _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
            }
            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
            {
                _func_int = &mul_QSYMM16_QSYMM16_S32;
            }
            break;
        case DataType::S16:
            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
                }
            }
            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
                }
            }
            break;
        case DataType::U8:
            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
                }
            }
            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
                }
            }
            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
                }
            }
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func_float = &mul_F16_F16_F16;
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func_float = &mul_F32_F32_F32;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
    }

    // Configure kernel window
    Coordinates coord;
    coord.set_num_dimensions(output->num_dimensions());
    output->set_valid_region(valid_region);
    Window win = calculate_max_window(valid_region, Steps());

    INEKernel::configure(win);
}

Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
                                                 RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));

    return Status{};
}

void NEPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    auto input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto output = tensors.get_tensor(TensorType::ACL_DST);

    if(_func_quantized != nullptr)
    {
        (*_func_quantized)(input1, input2, output, window, _scale);
    }
    else if(_func_int != nullptr)
    {
        (*_func_int)(input1, input2, output, window, _scale_exponent);
    }
    else
    {
        ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
        (*_func_float)(input1, input2, output, window, _scale);
    }
}
namespace
{
constexpr unsigned int num_elems_processed_per_iteration_complex = 2;

Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);

    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    // Validate in case of configured output
    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
    }

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
    auto_init_if_empty(*output, out_info);

    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);

    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);

    bool window_changed = update_window_and_padding(win_input1, input1_access)
                          || update_window_and_padding(win_input2, input2_access)
                          || update_window_and_padding(win, output_access);

    output_access.set_valid_region(win, valid_region);

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
} // namespace

void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));

    // Configure kernel window
    auto win_config = validate_and_configure_window_complex(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

    // Create kernel
    INEKernel::configure(win_config.second);
}

Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);

    return Status{};
}

void NEComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    auto input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto output = tensors.get_tensor(TensorType::ACL_DST);

    Iterator input1_it(input1, window.broadcast_if_dimension_le_one(input1->info()->tensor_shape()));
    Iterator input2_it(input2, window.broadcast_if_dimension_le_one(input2->info()->tensor_shape()));
    Iterator output_it(output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        c_mul_F32_F32_F32_n(input1_it.ptr(), input2_it.ptr(), output_it.ptr());
    },
    input1_it, input2_it, output_it);
}
} // namespace arm_compute