blob: cc7efe0a1df133463da95903120e4f702ed52bd5 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Viet-Hoa Do0d05b662022-09-09 15:39:05 +01002 * Copyright (c) 2016-2022 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/kernels/CpuMulKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Sheri Zhang1e3ab422021-03-16 17:35:08 +000026#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/TensorInfo.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010028#include "src/core/CPP/Validate.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010029#include "src/core/NEON/NEAsymm.h"
30#include "src/core/NEON/NESymm.h"
31#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034
35#include <arm_neon.h>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036
namespace
{
// Default minimum-workload-size thresholds (in elements), presumably used to
// decide how finely to split fp32 Neon(TM) multiplication work across threads
// — TODO confirm against the get_mws() implementation later in this file.
// Values are per-platform tuning constants (Neoverse N1 / V1 / generic 1D).
// Note: `static` dropped — it is redundant inside an anonymous namespace.
constexpr size_t default_mws_N1_fp32_neon              = 22447;
constexpr size_t default_mws_V1_fp32_neon              = 38982;
constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
} // namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043namespace arm_compute
44{
Sheri Zhang1e3ab422021-03-16 17:35:08 +000045namespace cpu
46{
47namespace kernels
48{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010049namespace
50{
// Constants shared by the scale == 1/255 fast paths below.
const float scale255_constant = 1.f / 255.f;
// Vectorised copy of 1/255 for the Neon(TM) code paths.
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
// +0.5 bias used to emulate round-half-up before vcvt truncates toward zero.
const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
54
Sheri Zhang1e3ab422021-03-16 17:35:08 +000055inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000056{
57 ARM_COMPUTE_UNUSED(overflow_policy);
58 ARM_COMPUTE_UNUSED(rounding_policy);
59
Sheri Zhang1e3ab422021-03-16 17:35:08 +000060 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
61 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
SiCong Libb88f892020-08-28 11:18:47 +010062 DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000063 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
SiCong Libb88f892020-08-28 11:18:47 +010064 DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000065 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
Michele Di Giorgio9428a182020-03-30 14:10:20 +010066 DataType::S16, DataType::QSYMM16,
67 DataType::S32, DataType::F16, DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000068 if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
Pablo Tello52ea9c22019-12-10 11:28:53 +000069 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000070 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
Georgios Pinitasd7d7e902019-12-18 15:40:54 +000071 ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
Pablo Tello52ea9c22019-12-10 11:28:53 +000072 }
Manuel Bottini79fa9a22019-02-22 17:54:22 +000073
Sheri Zhang1e3ab422021-03-16 17:35:08 +000074 if(dst->total_size() > 0)
Manuel Bottini79fa9a22019-02-22 17:54:22 +000075 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000076 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
77 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000078 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
SiCong Libb88f892020-08-28 11:18:47 +010079 // clang-format off
80 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Sheri Zhang1e3ab422021-03-16 17:35:08 +000081 !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
82 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
83 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
84 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
85 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
86 !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
SiCong Libb88f892020-08-28 11:18:47 +010087 , "Invalid data type combination");
88 // clang-format on
Sheri Zhang1e3ab422021-03-16 17:35:08 +000089 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000090 }
Michalis Spyrou861f0db2018-02-26 16:47:58 +000091
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000092 if(std::abs(scale - scale255_constant) < 0.00001f)
93 {
94 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000095 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
96 "Scale == 1/255 is not supported if input and dst are of data type S32");
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000097 }
98 else
99 {
100 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
101
102 int exponent = 0;
103 const float normalized_mantissa = std::frexp(scale, &exponent);
104
105 // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
106 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
107 // Moreover, it will be negative as we deal with 1/2^n
108 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255");
109 }
110
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000111 return Status{};
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +0000112}
113
/* Scales a given vector by 1/255.
 *
 * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
 *
 * @param in Input vector to scale.
 * @return Scaled result rounded to nearest (round half up).
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Multiply by 1/255, then emulate round-half-up: add +0.5 and let
    // vcvt truncate toward zero on the way back to integers.
    const float32x4_t scaled = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    return vcvtq_s32_f32(vaddq_f32(scaled, positive_round_f32q));
}
130
/* Scales each u16 lane of a vector by 1/255 (round half up).
 *
 * Widens each half of the input to s32, runs it through the s32 scaling
 * helper, then narrows and recombines the halves.
 */
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t scaled_lo = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    const int32x4_t scaled_hi = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(scaled_lo), vmovn_s32(scaled_hi)));
}
137
// Quantization shim: for T == int8_t, forward to the signed (QASYMM8_SIGNED)
// quantization helper so callers can use a single vquantize<T>() spelling.
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t vin, const UniformQuantizationInfo &qinfo)
{
    return vquantize_signed(vin, qinfo);
}
144
// Quantization shim: for T == uint8_t, forward to the plain vquantize overload.
// NOTE: this is NOT infinite recursion — T is not deducible from the arguments,
// so this template is not a viable candidate for the unqualified call below,
// which resolves to the non-template float32x4x4_t overload (presumably the one
// declared in NEAsymm.h — verify).
template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}
151
/** Element-wise multiplication of two quantized 8-bit tensors (QASYMM8 /
 * QASYMM8_SIGNED) with saturation: dequantize to float, multiply, requantize.
 *
 * The scale factor is folded into the output quantization info
 * (tmp_qua_info) so the per-element work is a single float multiply.
 *
 * @param[in]  src1   First input tensor.
 * @param[in]  src2   Second input tensor.
 * @param[out] out    Output tensor.
 * @param[in]  window Execution window.
 * @param[in]  scale  Scale factor applied to the product.
 */
template <typename T>
void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T); // Process a full 128-bit register per step.
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    // Fold the user scale into the requantization step.
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            // A single scalar is broadcast along x; splat it into a vector once per row.
            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                // (renamed from `src1`, which shadowed the ITensor* parameter of the same name)
                const T     in1_value = *(non_broadcast_input_ptr + x);
                const float tmp_in1   = Qasymm8QuantizationHelper<T>::dequantize(in1_value, non_broadcast_qinfo);
                const float tmp_in2   = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f     = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                // (renamed from `src1`/`src2`, which shadowed the ITensor* parameters)
                const T     in1_value = *(input1_ptr + x);
                const T     in2_value = *(input2_ptr + x);
                const float tmp_in1   = Qasymm8QuantizationHelper<T>::dequantize(in1_value, input1_qua_info);
                const float tmp_in2   = Qasymm8QuantizationHelper<T>::dequantize(in2_value, input2_qua_info);
                const float tmp_f     = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        input1, input2, dst);
    }
}
300
Omar Al Khatib605a9282022-11-01 17:01:24 +0000301bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
302{
303 const auto iq0 = src0->quantization_info().uniform();
304 const auto iq1 = src1->quantization_info().uniform();
305 const auto oq = dst->quantization_info().uniform();
306
307 const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
308
309 if(multiplier < -8191.f || multiplier > 8191.f)
310 {
311 //The multiplier cannot be stored as a 14.18 signed fixed-point number
312 return false;
313 }
314
315 const auto offset_out = float(oq.offset);
316
317 const auto max_result = multiplier * (256) * (256) + offset_out;
318
319 if(max_result > 8191.f)
320 {
321 //It might not be possible to store the result as a 14.18 signed fixed-point number.
322 return false;
323 }
324
325 return true;
326}
327
/** Element-wise multiplication of two quantized 8-bit tensors using 14.18
 * signed fixed-point arithmetic for the multiplier, output offset and
 * accumulator (no float per-element work).
 *
 * Only used once mul_q8_neon_fixedpoint_possible() has confirmed that all
 * intermediate values fit the 14.18 format.
 *
 * @param[in]  src0   First input tensor.
 * @param[in]  src1   Second input tensor.
 * @param[out] dst    Output tensor.
 * @param[in]  window Execution window.
 * @param[in]  scale  Scale factor applied to the product.
 */
template <typename ScalarType>
void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
{
    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    // Create input windows.
    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    // Clear the x dimension on the execution window as we process the whole row each iteration.
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = window.x().start();
    const auto    window_end_x          = window.x().end();
    const auto    is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info  = dst->info()->quantization_info().uniform();

    const auto in0_offset = iq0_info.offset;
    const auto in1_offset = iq1_info.offset;
    const auto out_offset = oq_info.offset;
    const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;

    // 2^18 converts the float multiplier/offset into 14.18 fixed point.
    constexpr int32_t two_pwr18i = 262144;
    constexpr float   two_pwr18f = 262144.f;

    const auto in0_offset_16p0  = static_cast<int16_t>(in0_offset);
    const auto in1_offset_16p0  = static_cast<int16_t>(in1_offset);
    const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
    const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);

    if(is_broadcast_across_x)
    {
        // Prefix: a = non-broadcast, b = broadcast.

        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto       a_win                = is_broadcast_input_1 ? in0_win : in1_win;
        auto       b_win                = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor             = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor             = is_broadcast_input_1 ? src1 : src0;

        const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
        const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;
#ifndef __aarch64__
        const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
        const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;
#endif //__aarch64__
        const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        a_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            // The broadcast scalar is offset-corrected once per row.
            const auto b_val            = *b_ptr;
            const auto b_offseted_32p0  = static_cast<int32_t>(b_val - b_offset_16p0);
            const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());

            const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
            const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);

                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));

                const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
                const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
                const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_15p1_0 = wrapper::vcombine(
                                             vout_15p1_00,
                                             vout_15p1_01);

                const auto vout_15p1_1 = wrapper::vcombine(
                                             vout_15p1_10,
                                             vout_15p1_11);

                // Note: a redundant re-declaration of out_ptr (shadowing the lambda-scope one above
                // with the identical value) was removed here.

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_15p1_0),
                                          wrapper::vqrshrn<2>(vout_15p1_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            //Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(
                                                                                                            b_val) - b_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        a_input_it, b_input_it, out_it);
    }
    else
    {
        const auto voffset0_16p0     = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffset1_16p0     = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
        const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);

                // Widen the input elements to signed 16-bit regardless of the input signedness.
                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));

                const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
                const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);

                const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
                const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
                const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_14p2_0 = wrapper::vcombine(
                                             vout_14p2_00,
                                             vout_14p2_01);

                const auto vout_14p2_1 = wrapper::vcombine(
                                             vout_14p2_10,
                                             vout_14p2_11);

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_14p2_0),
                                          wrapper::vqrshrn<2>(vout_14p2_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            //Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(
                                                                                                            in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        in0_it, in1_it, out_it);
    }
}
556
// Element-wise multiplication of two QSYMM16 tensors into a QSYMM16 output.
// Both inputs are dequantized to float, multiplied, and requantized using the
// output quantization info with the extra user-provided scale folded in.
//
// @param src1   First input tensor (QSYMM16).
// @param src2   Second input tensor (QSYMM16). May be broadcast along dims <= 1.
// @param out    Output tensor (QSYMM16).
// @param window Execution window (X dimension is traversed manually).
// @param scale  Extra multiplicative scale applied to the product.
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    // Fold the requested scale into the output quantization info so the whole
    // rescaling is performed by a single division at quantization time.
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            // Requantize with the adjusted (scale-folded) quantization info.
            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize dst, lrintf() has same rounding mode as vcombine_s16
            int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
            // Clamp to the QSYMM16 range *before* narrowing. Fix: the cast must cover
            // the whole conditional expression; previously only the boolean comparison
            // was cast, which still selected the right value but made the result rely
            // on an implicit narrowing conversion from int to qsymm16_t.
            const qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, dst);
}
642
// Element-wise multiplication of two QSYMM16 tensors producing a raw S32 product.
// No requantization or rescaling is applied: each output element is the product of
// the two quantized s16 values widened to 32 bits, so the `scale` parameter is unused.
void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Load 16 s16 elements from each input as two 8-lane registers.
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Widen each s16 half to s32 so the products cannot overflow.
            const int32x4x2_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x2_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            // Raw s32 x s32 products (no scaling applied).
            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Scalar tail: same widen-then-multiply as the vector path.
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, dst);
}
735
// Element-wise multiplication of two U8 tensors into a U8 output.
// Each product is computed in u16 and then rescaled either by 1/255
// (is_scale255) or by a right shift of n bits; is_sat selects saturating
// shift/narrowing back to u8 instead of wrapping/truncation.
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            // Widen each 8-lane half to u16 so the products cannot overflow.
            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                // Right shift by n is expressed as a (saturating) left shift by -n.
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                // Saturating narrow u16 -> u8.
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                // Truncating narrow u16 -> u8.
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Multiply by 1/255 with round-to-nearest to match scale255_U16_U16.
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, dst);
}
832
// Multiplies two s16x8 vectors with products computed in s32, rescales by 1/255
// (is_scale255) or by an arithmetic right shift of n bits with round-toward-zero,
// and narrows the result back to s16. is_sat selects saturating shift/narrowing
// instead of wrapping/truncating variants.
template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
{
    // Widen each half to s32 so the products cannot overflow.
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(src1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(src1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(src2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // An arithmetic right shift rounds toward -inf. To round toward zero,
        // negative lanes get (2^n - 1) added before the shift so the truncation
        // goes toward zero instead.
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit: sign * (2^n - 1), i.e. (2^n - 1) for negative
        // lanes and 0 for non-negative ones.
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        // Saturating narrow s32 -> s16.
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        // Truncating narrow s32 -> s16.
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}
885
// Applies the per-register s16 multiply/scale helper to each half of a
// 16-element register pair. Rescaling/saturation semantics are delegated
// entirely to mul_S16_S16_S16_n_loop.
template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
    int16x8x2_t out;
    // Elements 0-7
    out.val[0] = mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n);
    // Elements 8-15
    out.val[1] = mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n);
    return out;
}
901
// Element-wise multiplication of two S16 tensors into an S16 output.
// Products are computed in s32, rescaled by 1/255 (is_scale255) or shifted right
// by n bits with round-toward-zero, then narrowed back to s16 (saturating when
// is_sat is set, truncating otherwise).
template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Load 16 elements from each input as two 8-lane registers.
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Multiply by 1/255 with round-to-nearest, matching the vector path.
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero: add (2^n - 1) before the arithmetic shift
                    // so negative values truncate toward zero instead of -inf.
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
986
// Multiplies two s32x4 vectors with products computed in s64, scales them down by
// an arithmetic right shift of n bits with round-toward-zero, and narrows back to
// s32. is_sat selects saturating shift/narrowing instead of wrapping/truncating.
template <bool is_sat>
inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
{
    const int32x2_t input1_1 = vget_low_s32(src1);
    const int32x2_t input2_1 = vget_low_s32(src2);
    const int32x2_t input1_2 = vget_high_s32(src1);
    const int32x2_t input2_2 = vget_high_s32(src2);

    // Widening multiplies: s32 x s32 -> s64, so products cannot overflow.
    int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
    int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);

    // Apply scaling, conversion and rounding (round to zero)
    // Right shift amount
    const int64x2_t vn = vdupq_n_s64(-n);
    // Left shift amount
    const int64x2_t vnl = vdupq_n_s64(n);
    // Calculate conversion bit: sign * (2^n - 1), added before the arithmetic shift
    // so negative lanes truncate toward zero instead of -inf.
    const uint64x2_t tmp_1_u   = vreinterpretq_u64_s64(tmp_1);
    const uint64x2_t sign_1    = vshrq_n_u64(tmp_1_u, 63);
    const int64x2_t  sign_1_s  = vreinterpretq_s64_u64(sign_1);
    const int64x2_t  convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);

    const uint64x2_t tmp_2_u   = vreinterpretq_u64_s64(tmp_2);
    const uint64x2_t sign_2    = vshrq_n_u64(tmp_2_u, 63);
    const int64x2_t  sign_2_s  = vreinterpretq_s64_u64(sign_2);
    const int64x2_t  convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
    if(is_sat)
    {
        // Saturating shift then saturating narrow s64 -> s32.
        tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
    }
    else
    {
        // Plain shift then truncating narrow s64 -> s32.
        tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
    }
}
1026
// Applies the per-register s32 multiply/scale helper to each half of an
// 8-element register pair. Shift/saturation semantics are delegated entirely
// to mul_S32_S32_S32_n_loop.
template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
{
    int32x4x2_t out;
    // First 4 elements
    out.val[0] = mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n);
    // Second 4 elements
    out.val[1] = mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n);
    return out;
}
1042
// Element-wise multiplication of two S32 tensors into an S32 output.
// Products are computed in s64, shifted right by n bits with round-toward-zero and
// narrowed back to s32 (clamped to the s32 range when is_sat). One input may be
// broadcast along the X dimension; the broadcast case takes a dedicated path.
template <bool is_sat>
void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    if(is_broadcast_across_x)
    {
        // A window step of 0 on X identifies which input is the broadcast one.
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int32_t *>(dst.ptr());

            // The broadcast value is constant along X: load once, splat to all lanes.
            const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
            const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t broadcast_v =
                {
                    {
                        broadcast_value_vec,
                        broadcast_value_vec,
                    }
                };
                const int32x4x2_t non_broadcast_v =
                {
                    {
                        vld1q_s32(non_broadcast_input_ptr + x),
                        vld1q_s32(non_broadcast_input_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero: add (2^n - 1) before the arithmetic shift
                    // so negative values truncate toward zero instead of -inf.
                    uint64_t mask = ((uint64_t)1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t ta1 =
                {
                    {
                        vld1q_s32(input1_ptr + x),
                        vld1q_s32(input1_ptr + x + 4),
                    }
                };
                const int32x4x2_t ta2 =
                {
                    {
                        vld1q_s32(input2_ptr + x),
                        vld1q_s32(input2_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero: add (2^n - 1) before the arithmetic shift
                    // so negative values truncate toward zero instead of -inf.
                    uint64_t mask = ((uint64_t)1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        input1, input2, dst);
    }
}
1195
// Element-wise multiplication of two F32 tensors with an extra scalar scale:
// dst = src1 * src2 * scale. One input may be broadcast along the X dimension.
//
// @param src1   First input tensor (F32).
// @param src2   Second input tensor (F32).
// @param out    Output tensor (F32).
// @param window Execution window (X dimension is traversed manually).
// @param scale  Scalar multiplier applied to every product.
void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

    if(is_broadcast_across_x)
    {
        // A window step of 0 on X identifies which input is the broadcast one.
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            // The broadcast value is constant along X: load once, splat to all lanes.
            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
            const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // Hoist the loop-invariant scale vector out of the inner loop
            // (previously re-created every iteration; this also matches the
            // broadcast path above).
            const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1 = wrapper::vloadq(input1_ptr + x);
                const auto ta2 = wrapper::vloadq(input2_ptr + x);
                const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
1295
// Element-wise multiplication of two complex float32 tensors.
// Each logical element is an interleaved {real, imaginary} pair of floats, so the
// scalar tail below computes the complex product (a0 + a1*i) * (b0 + b1*i) =
// (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i.
// The vector loop handles window_step_x == 2 complex values (4 floats) per iteration.
// Broadcasting of one input along X is supported (only one value is read per row).
void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    // 8 / sizeof(float) == 2 complex elements (i.e. one 4-float NEON register) per step
    constexpr int window_step_x         = 8 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            // NOTE(review): only ONE float is read from the broadcast tensor, and the scalar
            // tail uses it for both the real and imaginary components of the broadcast operand
            // (i.e. the broadcast value is treated as b + b*i) — confirm this matches the
            // intended complex-broadcast semantics of the caller.
            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // a = {r0, i0, r1, i1}; b has the broadcast value in every lane.
                const auto  a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
                float32x4_t b = vdupq_n_f32(broadcast_value);

                // Build tmp0 = {r0, r0, r1, r1} and tmp1 = {i0, i0, i1, i1} by duplicating
                // each scalar lane into a 2-lane vector and recombining.
                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                // res = {r*b, r*b, ...}; then negate b in the even (real-output) lanes so the
                // fused multiply-accumulate yields {r*b - i*b, r*b + i*b, ...}.
                // (No vrev64 is needed here: all lanes of b hold the same broadcast value.)
                float32x4_t res = wrapper::vmul(tmp0, b);
                b               = wrapper::vmul(b, mask);

                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
                const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
                // (a0 + a1*i) * (b + b*i): real = b*(a0 - a1), imag = b*(a1 + a0)
                auto       res1                 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
                auto       res2                 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // a = {r0, i0, r1, i1}, b = {br0, bi0, br1, bi1}
                const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
                float32x4_t       b = wrapper::vloadq(input2_ptr + 2 * x);

                // tmp0 = {r0, r0, r1, r1}, tmp1 = {i0, i0, i1, i1}
                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                // res = {r*br, r*bi, ...}
                float32x4_t res = wrapper::vmul(tmp0, b);

                // rev64 swaps the re/im lanes within each pair, then the mask negates the
                // lane that lands in the real output: b becomes {-bi0, br0, -bi1, br1}.
                b = wrapper::vrev64(b);
                b = wrapper::vmul(b, mask);

                // res = {r*br - i*bi, r*bi + i*br, ...} — the complex product.
                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto a0 = *(input1_ptr + 2 * x);
                const auto a1 = *(input1_ptr + 2 * x + 1);
                const auto b0 = *(input2_ptr + 2 * x);
                const auto b1 = *(input2_ptr + 2 * x + 1);
                // Scalar complex multiply: real = a0*b0 - a1*b1, imag = a0*b1 + a1*b0
                auto       res1 = a0 * b0 - a1 * b1;
                auto       res2 = a0 * b1 + a1 * b0;
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        input1, input2, dst);
    }
}
1430
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001431#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Element-wise multiplication of two float16 tensors with a float scale:
// out = src1 * src2 * scale.
// Processes 16 fp16 values per vector iteration as two 8-lane halves (float16x8x2_t)
// and supports broadcasting of one input along the X dimension.
// Only compiled when fp16 vector arithmetic is available (see surrounding #ifdef).
void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    constexpr int window_step_x         = 16;
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);
        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float16_t *>(dst.ptr());
            // One scalar per row of the broadcast input, splatted into both vector halves
            const auto broadcast_value         = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
            const float16x8x2_t broadcast_value_vec =
            {
                {
                    vdupq_n_f16(broadcast_value),
                    vdupq_n_f16(broadcast_value),
                }
            };
            const auto scale_vec = vdupq_n_f16(scale);
            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t non_broadcast_v =
                {
                    {
                        vld1q_f16(non_broadcast_input_ptr + x),
                        vld1q_f16(non_broadcast_input_ptr + x + 8),
                    }
                };
                // result = broadcast * non_broadcast * scale, per 8-lane half
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }
            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);
        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t ta1 =
                {
                    {
                        vld1q_f16(input1_ptr + x),
                        vld1q_f16(input1_ptr + x + 8),
                    }
                };
                const float16x8x2_t ta2 =
                {
                    {
                        vld1q_f16(input2_ptr + x),
                        vld1q_f16(input2_ptr + x + 8),
                    }
                };
                const float16x8_t scale_vec = vdupq_n_f16(scale);
                // result = ta1 * ta2 * scale, per 8-lane half
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }
            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001554#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellodf246182017-07-03 16:25:09 +01001555
// Element-wise multiplication U8 * U8 -> S16.
// Inputs are widened to 16 bits before multiplying (255 * 255 = 65025 fits in uint16_t).
// Template parameters:
//   is_scale255 - when true, the product is scaled by ~1/255 with rounding via
//                 scale255_U16_U16 (helper defined earlier in this file);
//                 when false, the product is shifted right by n bits instead.
//   is_sat      - when true, the result is clamped to SHRT_MAX so it fits in the
//                 signed 16-bit output (the product is never negative, so no lower clamp).
// No broadcasting is handled here beyond what the input windows provide.
template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            // Widen each u8 half to u16 and multiply: the product cannot overflow u16
            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                // Left shift by a negative amount == right shift by n
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                // Clamp to SHRT_MAX so the u16 value reinterprets as a non-negative s16
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements (scalar mirror of the vector path above)
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Round-to-nearest scaling; tmp is always non-negative here
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
1654
// Element-wise multiplication S16 * U8 -> S16.
// The U8 input is widened to S16 (via u16) and the per-vector arithmetic is delegated
// to mul_S16_S16_S16_n_k (templated helper defined earlier in this file — handles the
// is_scale255 / is_sat combinations for a pair of int16x8x2_t operands).
// Template parameters:
//   is_scale255 - scale the product by ~1/255 with rounding instead of shifting
//   is_sat      - saturate the result to [SHRT_MIN, SHRT_MAX]
template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            // Widen u8 -> u16, then reinterpret as s16 (values <= 255, so no sign issue)
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements (scalar mirror of the vector path above)
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Round-to-nearest scaling by ~1/255
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // For negative values, bias by (2^n - 1) before the arithmetic shift
                    // so the division rounds towards zero instead of towards -infinity
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
1747
// Element-wise multiplication U8 * S16 -> S16.
// Multiplication is commutative, so simply swap the two input buffers and reuse
// the S16 * U8 implementation above; template parameters are forwarded unchanged.
template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
}
1754} // namespace
1755
// Configure the multiplication kernel:
//  - validate the src1/src2/dst combination,
//  - auto-initialize the destination shape from the broadcast of the input shapes,
//  - turn the scale into either the "scale255" fast path or a power-of-two exponent,
//  - select the per-data-type kernel function (_func_quantized / _func_int / _func_float),
//  - compute the execution window and the dimension the scheduler should split on.
// rounding_policy is only used by validation (hence ARM_COMPUTE_UNUSED below).
void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(rounding_policy);
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());

    // Auto initialize dst if not initialized
    set_shape_if_empty(*dst, out_shape);

    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate scaling factor:
    // a scale (approximately) equal to scale255_constant selects the dedicated
    // rounding "scale255" kernel variants; any other scale is assumed to be a
    // power of two 1/2^n and is stored as a positive shift exponent.
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;

        std::frexp(scale, &exponent);

        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
        _scale_exponent = std::abs(exponent - 1);
    }

    const DataType dt_input1 = src1->data_type();
    const DataType dt_input2 = src2->data_type();
    const DataType dt_output = dst->data_type();
    const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);

    // Select the kernel implementation from the (input1, input2, output) type triple.
    // Combinations not handled here leave all function pointers null (rejected earlier
    // by validate_arguments) or hit the default error.
    switch(dt_input1)
    {
        case DataType::QASYMM8:
            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
            {
                // Prefer the fixed-point path when the scales/offsets allow it
                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                {
                    _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
                }
                else
                {
                    _func_quantized = &mul_saturate_quantized_8<uint8_t>;
                }
            }
            break;
        case DataType::QASYMM8_SIGNED:
            // NOTE(review): unlike the QASYMM8 case above, dt_output is not checked here;
            // presumably validate_arguments() already rejected mismatching outputs — confirm.
            if(dt_input2 == DataType::QASYMM8_SIGNED)
            {
                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                {
                    _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
                }
                else
                {
                    _func_quantized = &mul_saturate_quantized_8<int8_t>;
                }
            }
            break;
        case DataType::QSYMM16:
            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
            {
                _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
            }
            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
            {
                _func_int = &mul_QSYMM16_QSYMM16_S32;
            }
            break;
        case DataType::S16:
            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
                }
            }
            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
                }
            }
            break;
        case DataType::S32:
            if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
            {
                _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
            }
            break;
        case DataType::U8:
            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
                }
            }
            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
                }
            }
            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
                }
            }
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func_float = &mul_F16_F16_F16;
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func_float = &mul_F32_F32_F32;
            break;
        default:
            ARM_COMPUTE_ERROR("You called with the wrong img formats");
    }

    // Configure kernel window; also record which dimension the scheduler should split
    // (used by get_mws() to avoid over-splitting tensors squashed to 1D)
    Window win;
    std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);

    ICpuKernel::configure(win);
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001917
// Return the minimum workload size (mws) the scheduler should hand to one thread.
// Tuned values exist only for the FP32 path (and only when ENABLE_FP32_KERNELS is
// defined) on Neoverse N1/V1 cores; every other configuration falls through to the
// generic defaults at the bottom. thread_count is currently unused.
size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
    ARM_COMPUTE_UNUSED(thread_count);

#if defined(ENABLE_FP32_KERNELS)
    if(this->_func_float == &mul_F32_F32_F32)
    {
        size_t mws = ICPPKernel::default_mws;
        if(platform.get_cpu_model() == CPUModel::N1)
        {
            mws = default_mws_N1_fp32_neon;
        }
        else if(platform.get_cpu_model() == CPUModel::V1)
        {
            mws = default_mws_V1_fp32_neon;
        }
        else
        {
            // Untuned CPU model: return early with the generic defaults
            // (the scaling logic below only applies to the tuned N1/V1 values)
            if(_split_dimension == Window::DimX)
            {
                // Don't split the work load too small if the tensor has been reinterpreted as 1D.
                // This number is loosely chosen as threading overhead in each platform varies wildly.
                return default_mws_other_platforms_1d_tensor;
            }
            return default_mws;
        }

        // tensor is 1D or was re-interpreted as 1D
        if(this->window().shape().num_dimensions() == 1)
        {
            return mws;
        }
        else
        {
            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
            // but the other sizes are large, which boosts performance.
            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
            return std::max(static_cast<size_t>(1), mws);
        }
    }
#else /* ENABLE_FP32_KERNELS */
    ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
    // Generic fallback (non-FP32 kernels, or FP32 kernels disabled at build time)
    if(_split_dimension == Window::DimX)
    {
        // Don't split the work load too small if the tensor has been reinterpreted as 1D.
        // This number is loosely chosen as threading overhead in each platform varies wildly.
        return default_mws_other_platforms_1d_tensor;
    }
    return default_mws;
}
1970
// Static validation entry point: checks whether the given tensor-info/scale/policy
// combination is supported, without configuring a kernel. Returns an empty Status
// on success; ARM_COMPUTE_RETURN_ON_ERROR propagates the failure status otherwise.
Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
                              RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    return Status{};
}
1979
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001980void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001981{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001982 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001983 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001984 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001985
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001986 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1987 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1988 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001989
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001990 if(_func_quantized != nullptr)
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001991 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001992 (*_func_quantized)(src1, src2, dst, window, _scale);
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001993 }
1994 else if(_func_int != nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001995 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001996 (*_func_int)(src1, src2, dst, window, _scale_exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001997 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001998 else
1999 {
2000 ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002001 (*_func_float)(src1, src2, dst, window, _scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002002 }
2003}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00002004
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002005const char *CpuMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002006{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002007 return "CpuMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002008}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00002009
giuros01154bc1c2019-03-26 17:44:40 +00002010namespace
2011{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002012Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002013{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002014 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
2015 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
giuros01154bc1c2019-03-26 17:44:40 +00002016
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002017 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00002018
2019 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
2020
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002021 // Validate in case of configured dst
2022 if(dst->total_size() > 0)
giuros01154bc1c2019-03-26 17:44:40 +00002023 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002024 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
2025 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
giuros01154bc1c2019-03-26 17:44:40 +00002026 }
2027
2028 return Status{};
2029}
Sheri Zhang4d91dc62020-09-23 11:22:50 +01002030} // namespace
giuros01154bc1c2019-03-26 17:44:40 +00002031
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002032void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002033{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002034 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2035 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
Sheri Zhang4d91dc62020-09-23 11:22:50 +01002036
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002037 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00002038
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002039 // Auto initialize dst if not initialized
2040 const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
2041 auto_init_if_empty(*dst, out_info);
giuros01154bc1c2019-03-26 17:44:40 +00002042
giuros01154bc1c2019-03-26 17:44:40 +00002043 // Configure kernel window
SiCongLic7b1e842021-02-22 14:28:33 +00002044 Window win = calculate_max_window(out_shape);
giuros01154bc1c2019-03-26 17:44:40 +00002045
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002046 ICpuKernel::configure(win);
giuros01154bc1c2019-03-26 17:44:40 +00002047}
2048
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002049Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002050{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002051 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2052 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
giuros01154bc1c2019-03-26 17:44:40 +00002053
2054 return Status{};
2055}
2056
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002057void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
giuros01154bc1c2019-03-26 17:44:40 +00002058{
2059 ARM_COMPUTE_UNUSED(info);
2060 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002061 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
giuros01154bc1c2019-03-26 17:44:40 +00002062
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002063 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
2064 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
2065 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01002066
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002067 c_mul_F32_F32_F32_n(src1, src2, dst, window);
giuros01154bc1c2019-03-26 17:44:40 +00002068}
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002069
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002070const char *CpuComplexMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002071{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002072 return "CpuComplexMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002073}
2074} // namespace kernels
2075} // namespace cpu
Manuel Bottini79fa9a22019-02-22 17:54:22 +00002076} // namespace arm_compute