/*
 * Copyright (c) 2016-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/CpuMulKernel.h"

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>

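// Default minimum workload sizes (in elements) below which it is not worth splitting the fp32
// Neon multiplication across threads. The N1/V1 suffixes presumably refer to Arm Neoverse N1 and
// V1 cores, and the values appear to be empirically tuned.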
namespace
{
static constexpr size_t default_mws_N1_fp32_neon = 22447;
static constexpr size_t default_mws_V1_fp32_neon = 38982;
}
namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);

inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(overflow_policy);
    ARM_COMPUTE_UNUSED(rounding_policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16,
                                                         DataType::S32, DataType::F16, DataType::F32);
    if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
    }

    if(dst->total_size() > 0)
    {
        const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
        // clang-format off
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
            !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
            , "Invalid data type combination");
        // clang-format on
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
                                        "Scale == 1/255 is not supported if input and dst are of data type S32");
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);

        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);

        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
        // frexp returns 0.5 as the mantissa, which means that for scale = 1/2^n the exponent is
        // 1 - n, i.e. in the range -14 <= e <= 1 (and negative for every n >= 2, as we deal with 1/2^n)
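        // Worked example: scale = 1/8 = 0.125 decomposes as 0.5 * 2^(-2), so frexp() yields a
        // normalized mantissa of 0.5 with exponent -2 (= 1 - n for n = 3), which passes the check below.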
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
    }

    return Status{};
}

/* Scales a given vector by 1/255.
 *
 * @note This does not work for all cases, e.g. for a float like 0.49999999999999994 and for very large floats.
 *
 * @param in Input vector to scale.
 * @return Scaled result rounded to nearest (round half up).
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Scale
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // Round to nearest (round half up)
    // Add +0.5 for all values
    // Afterwards vcvt rounds toward zero
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}

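// Scales a u16 vector by 1/255 by widening each half to s32 (a u16 product of two u8 values is at
// most 255 * 255 = 65025, so it always fits), scaling each half, and narrowing back to u16.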
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
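    // Not a recursive call: T cannot be deduced from the argument list, so this unqualified call
    // resolves to the non-template vquantize() overload for uint8 (declared in NEAsymm.h).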
    return vquantize(val, info);
}

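// Generic QASYMM8 / QASYMM8_SIGNED multiplication: dequantize both inputs to float, multiply,
// then requantize. The element-wise scale is folded into the quantization info used for the
// final quantization step (see tmp_qua_info below).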
template <typename T>
void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T);
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
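    // Fold the element-wise scale into the output quantization info: quantizing the raw float
    // product with scale' = out_scale / scale gives
    //   round((in1_f * in2_f) / scale') + offset = round((in1_f * in2_f * scale) / out_scale) + offset,
    // i.e. the multiplication scale is applied for free during quantization.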
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     src1    = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x) = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     src1    = *(input1_ptr + x);
                const T     src2    = *(input2_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x) = tmp_qua;
            }
        },
        input1, input2, dst);
    }
}

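// Checks whether the quantized 8-bit multiplication can use the fixed-point path below: both the
// combined multiplier and the worst-case result must be representable as signed 14.18 fixed-point
// values, i.e. lie roughly within (-8192, 8192).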
bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
{
    const auto iq0 = src0->quantization_info().uniform();
    const auto iq1 = src1->quantization_info().uniform();
    const auto oq  = dst->quantization_info().uniform();

    const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;

    if(multiplier < -8191.f || multiplier > 8191.f)
    {
        // The multiplier cannot be stored as a 14.18 signed fixed-point number
        return false;
    }

    const auto offset_out = float(oq.offset);

    const auto max_result = multiplier * (256) * (256) + offset_out;

    if(max_result > 8191.f)
    {
        // It might not be possible to store the result as a 14.18 signed fixed-point number.
        return false;
    }

    return true;
}

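// Fixed-point implementation of the quantized 8-bit multiplication:
//   1. widen the 8-bit inputs to 16 bits and subtract the quantization offsets into 32-bit lanes,
//   2. multiply the offset-corrected values as plain integers,
//   3. multiply-accumulate the products with the 14.18 fixed-point multiplier onto the 14.18
//      output offset (vmla),
//   4. shift right by 18 in total (8 + 8 + 2, with rounding on the narrowing shifts) and narrow
//      back to 8 bits.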
template <typename ScalarType>
void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
{
    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    // Create input windows.
    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    // Clear the x dimension on the execution window as we process the whole row each iteration.
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = window.x().start();
    const auto    window_end_x          = window.x().end();
    const auto    is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info  = dst->info()->quantization_info().uniform();

    const auto in0_offset = iq0_info.offset;
    const auto in1_offset = iq1_info.offset;
    const auto out_offset = oq_info.offset;
    const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;

    constexpr int32_t two_pwr18i = 262144;
    constexpr float   two_pwr18f = 262144.f;

    const auto in0_offset_16p0  = static_cast<int16_t>(in0_offset);
    const auto in1_offset_16p0  = static_cast<int16_t>(in1_offset);
    const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
    const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);

    if(is_broadcast_across_x)
    {
        // Prefix: a = non-broadcast, b = broadcast.

        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto       a_win                = is_broadcast_input_1 ? in0_win : in1_win;
        auto       b_win                = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor             = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor             = is_broadcast_input_1 ? src1 : src0;

        const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
        const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;
#ifndef __aarch64__
        const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
        const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;
#endif //__aarch64__
        const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        a_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            const auto b_val            = *b_ptr;
            const auto b_offseted_32p0  = static_cast<int32_t>(b_val - b_offset_16p0);
            const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());

            const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
            const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);

                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));

                const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
                const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These right shifts revert the earlier multiplication by 2^18. A hard limit of a
                // maximum shift by 8 per instruction means the total shift must be split across several instructions.
                const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_15p1_0 = wrapper::vcombine(
                                             vout_15p1_00,
                                             vout_15p1_01);

                const auto vout_15p1_1 = wrapper::vcombine(
                                             vout_15p1_10,
                                             vout_15p1_11);

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_15p1_0),
                                          wrapper::vqrshrn<2>(vout_15p1_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
                                                     (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        a_input_it, b_input_it, out_it);
    }
    else
    {
        const auto voffset0_16p0     = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffset1_16p0     = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
        const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);

                // Widen the input elements to signed 16-bit regardless of the input signedness.
                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));

                const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
                const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);

                const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
                const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These right shifts revert the earlier multiplication by 2^18. A hard limit of a
                // maximum shift by 8 per instruction means the total shift must be split across several instructions.
                const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_14p2_0 = wrapper::vcombine(
                                             vout_14p2_00,
                                             vout_14p2_01);

                const auto vout_14p2_1 = wrapper::vcombine(
                                             vout_14p2_10,
                                             vout_14p2_11);

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_14p2_0),
                                          wrapper::vqrshrn<2>(vout_14p2_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
                                                     (multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        in0_it, in1_it, out_it);
    }
}

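// QSYMM16 * QSYMM16 -> QSYMM16: dequantize to float, multiply, and requantize with the
// element-wise scale folded into the output quantization info, mirroring the 8-bit path above.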
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize dst; lrintf() matches the rounding mode of the vectorized quantization above
            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, dst);
}

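// QSYMM16 * QSYMM16 -> S32: plain widening integer multiply. The scale parameter is unused here;
// validate_arguments() only accepts scale == 1 for this data type combination.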
void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, dst);
}

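// U8 * U8 -> U8. The compile-time flags select the scaling variant: is_scale255 uses the rounding
// 1/255 helpers above, otherwise the scale is 1/2^n and is applied as a right shift by n;
// is_sat selects saturating narrowing of the 16-bit intermediates.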
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, dst);
}

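// Multiplies eight signed 16-bit lanes using 32-bit intermediates. For the 1/2^n scale, a plain
// arithmetic right shift would round towards negative infinity; adding (2^n - 1) to negative
// products first (the "conversion bit" computed below) makes the shift round towards zero instead.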
template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(src1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(src1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(src2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}

template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
        }
    };

    return result;
}

template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}

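// Same round-towards-zero scheme as the S16 path above, but with 64-bit intermediates so the
// S32 * S32 products and the sign correction cannot overflow before narrowing.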
Omar Al Khatib605a9282022-11-01 17:01:24 +0000986template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000987inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +0100988{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000989 const int32x2_t input1_1 = vget_low_s32(src1);
990 const int32x2_t input2_1 = vget_low_s32(src2);
991 const int32x2_t input1_2 = vget_high_s32(src1);
992 const int32x2_t input2_2 = vget_high_s32(src2);
SiCong Libb88f892020-08-28 11:18:47 +0100993
994 int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
995 int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
996
997 // Apply scaling, conversion and rounding (round to zero)
998 // Right shift amount
999 const int64x2_t vn = vdupq_n_s64(-n);
1000 // Left shift amount
1001 const int64x2_t vnl = vdupq_n_s64(n);
1002 // Calculate conversion bit
1003 const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
1004 const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
1005 const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
1006 const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
1007
1008 const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
1009 const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
1010 const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
1011 const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
1012 if(is_sat)
1013 {
1014 tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
1015 tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
1016 return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
1017 }
1018 else
1019 {
1020 tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
1021 tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
1022 return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
1023 }
1024}
1025
Omar Al Khatib605a9282022-11-01 17:01:24 +00001026template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001027inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +01001028{
1029 const int32x4x2_t result =
1030 {
1031 {
1032 // First 4 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001033 mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
SiCong Libb88f892020-08-28 11:18:47 +01001034 // Second 4 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001035 mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
SiCong Libb88f892020-08-28 11:18:47 +01001036 }
1037 };
1038
1039 return result;
1040}
1041
1042template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001043void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
SiCong Libb88f892020-08-28 11:18:47 +01001044{
1045 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001046 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1047 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
SiCong Libb88f892020-08-28 11:18:47 +01001048
1049 // Clear X Dimension on execution window as we handle manually
SiCong Lid6d1b362020-09-24 17:34:23 +01001050 Window win = window;
SiCong Libb88f892020-08-28 11:18:47 +01001051 win.set(Window::DimX, Window::Dimension(0, 1, 1));
SiCong Libb88f892020-08-28 11:18:47 +01001052
SiCong Lid6d1b362020-09-24 17:34:23 +01001053 const int window_step_x = 8;
1054 const auto window_start_x = static_cast<int>(window.x().start());
1055 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001056 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
SiCong Libb88f892020-08-28 11:18:47 +01001057
SiCong Lid6d1b362020-09-24 17:34:23 +01001058 if(is_broadcast_across_x)
SiCong Libb88f892020-08-28 11:18:47 +01001059 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001060 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1061 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1062 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001063 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1064 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
SiCong Libb88f892020-08-28 11:18:47 +01001065
SiCong Lid6d1b362020-09-24 17:34:23 +01001066 // Clear X Dimension on execution window as we handle manually
1067 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1068
1069 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1070 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001071 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +01001072
Omar Al Khatib605a9282022-11-01 17:01:24 +00001073 execute_window_loop(
1074 win, [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +01001075 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001076 const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001077 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +01001078
SiCong Lid6d1b362020-09-24 17:34:23 +01001079 const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
1080 const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
SiCong Libb88f892020-08-28 11:18:47 +01001081
SiCong Lid6d1b362020-09-24 17:34:23 +01001082 // Compute window_step_x elements per iteration
1083 int x = window_start_x;
1084 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1085 {
1086 const int32x4x2_t broadcast_v =
1087 {
1088 {
1089 broadcast_value_vec,
1090 broadcast_value_vec,
1091 }
1092 };
1093 const int32x4x2_t non_broadcast_v =
1094 {
1095 {
1096 vld1q_s32(non_broadcast_input_ptr + x),
1097 vld1q_s32(non_broadcast_input_ptr + x + 4),
1098 }
1099 };
1100 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
1101
1102 vst1q_s32(output_ptr + x, result.val[0]);
1103 vst1q_s32(output_ptr + x + 4, result.val[1]);
1104 }
1105
1106 // Compute left-over elements
1107 for(; x < window_end_x; ++x)
1108 {
1109 int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
1110
1111 if(tmp >= 0)
1112 {
1113 tmp >>= n;
1114 }
1115 else
1116 {
Michalis Spyrou7f8caf72021-05-13 13:35:30 +01001117 uint64_t mask = ((uint64_t)1u << n) - 1;
SiCong Lid6d1b362020-09-24 17:34:23 +01001118 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
1119 }
1120 if(is_sat)
1121 {
1122 tmp = utility::clamp<int64_t, int32_t>(tmp);
1123 }
1124 *(output_ptr + x) = static_cast<int32_t>(tmp);
1125 }
1126 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001127 broadcast_input, non_broadcast_input, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001128 }
1129 else
1130 {
1131 // Clear X Dimension on execution window as we handle manually
1132 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1133 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1134
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001135 Iterator input1(src1, input1_win);
1136 Iterator input2(src2, input2_win);
1137 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +01001138
Omar Al Khatib605a9282022-11-01 17:01:24 +00001139 execute_window_loop(
1140 win, [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +01001141 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001142 const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
1143 const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001144 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +01001145
SiCong Lid6d1b362020-09-24 17:34:23 +01001146 // Compute window_step_x elements per iteration
1147 int x = window_start_x;
1148 for(; x <= (window_end_x - window_step_x); x += window_step_x)
SiCong Libb88f892020-08-28 11:18:47 +01001149 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001150 const int32x4x2_t ta1 =
1151 {
1152 {
1153 vld1q_s32(input1_ptr + x),
1154 vld1q_s32(input1_ptr + x + 4),
1155 }
1156 };
1157 const int32x4x2_t ta2 =
1158 {
1159 {
1160 vld1q_s32(input2_ptr + x),
1161 vld1q_s32(input2_ptr + x + 4),
1162 }
1163 };
1164 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
1165
1166 vst1q_s32(output_ptr + x, result.val[0]);
1167 vst1q_s32(output_ptr + x + 4, result.val[1]);
SiCong Libb88f892020-08-28 11:18:47 +01001168 }
SiCong Lid6d1b362020-09-24 17:34:23 +01001169
1170 // Compute left-over elements
1171 for(; x < window_end_x; ++x)
SiCong Libb88f892020-08-28 11:18:47 +01001172 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001173 int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
1174
1175 if(tmp >= 0)
1176 {
1177 tmp >>= n;
1178 }
1179 else
1180 {
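                // Same round-towards-zero adjustment as in the broadcast path above.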
Michalis Spyrou7f8caf72021-05-13 13:35:30 +01001181 uint64_t mask = ((uint64_t)1u << n) - 1;
SiCong Lid6d1b362020-09-24 17:34:23 +01001182 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
1183 }
1184 if(is_sat)
1185 {
1186 tmp = utility::clamp<int64_t, int32_t>(tmp);
1187 }
1188 *(output_ptr + x) = static_cast<int32_t>(tmp);
SiCong Libb88f892020-08-28 11:18:47 +01001189 }
SiCong Lid6d1b362020-09-24 17:34:23 +01001190 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001191 input1, input2, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001192 }
SiCong Libb88f892020-08-28 11:18:47 +01001193}
1194
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001195void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001196{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001197 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001198 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1199 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001200
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001201 // Clear X Dimension on execution window as we handle manually
1202 Window win = window;
1203 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1204
1205 constexpr int window_step_x = 16 / sizeof(float);
1206 const auto window_start_x = static_cast<int>(window.x().start());
1207 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001208 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001209
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001210 using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
1211
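    // If one input has x-extent 1, its single value per row is broadcast against
    // every x element of the other input; otherwise both inputs advance together.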
1212 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001213 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001214 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1215 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1216 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001217 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1218 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001219
1220 // Clear X Dimension on execution window as we handle manually
1221 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1222
1223 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1224 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001225 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001226
Omar Al Khatib605a9282022-11-01 17:01:24 +00001227 execute_window_loop(
1228 win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001229 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001230 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001231 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001232
1233 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
1234 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
1235 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
1236
1237 // Compute window_step_x elements per iteration
1238 int x = window_start_x;
1239 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1240 {
1241 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
1242 auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
1243 wrapper::vstore(output_ptr + x, res);
1244 }
1245
1246 // Compute left-over elements
1247 for(; x < window_end_x; ++x)
1248 {
1249 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
1250 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
1251 }
1252 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001253 broadcast_input, non_broadcast_input, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001254 }
1255 else
1256 {
1257 // Clear X Dimension on execution window as we handle manually
1258 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1259 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1260
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001261 Iterator input1(src1, input1_win);
1262 Iterator input2(src2, input2_win);
1263 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001264
Omar Al Khatib605a9282022-11-01 17:01:24 +00001265 execute_window_loop(
1266 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001267 {
1268 const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
1269 const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001270 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001271
1272 // Compute window_step_x elements per iteration
1273 int x = window_start_x;
1274 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1275 {
1276 const auto ta1 = wrapper::vloadq(input1_ptr + x);
1277 const auto ta2 = wrapper::vloadq(input2_ptr + x);
1278 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
1279 const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
1280 wrapper::vstore(output_ptr + x, res);
1281 }
1282
1283 // Compute left-over elements
1284 for(; x < window_end_x; ++x)
1285 {
1286 const auto ta1 = *(input1_ptr + x);
1287 const auto ta2 = *(input2_ptr + x);
1288 *(output_ptr + x) = ta1 * ta2 * scale;
1289 }
1290 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001291 input1, input2, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001292 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001293}
1294
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001295void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
giuros01154bc1c2019-03-26 17:44:40 +00001296{
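    // Tensors hold interleaved complex values [real, imag]. Each pair is multiplied as
    // (a0 + i*a1) * (b0 + i*b1) = (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0); the vector paths
    // below implement this with lane duplication and a {-1, 1, -1, 1} sign mask.
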
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001297 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001298 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1299 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00001300
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001301 // Clear X Dimension on execution window as we handle manually
1302 Window win = window;
1303 win.set(Window::DimX, Window::Dimension(0, 1, 1));
giuros01154bc1c2019-03-26 17:44:40 +00001304
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001305 constexpr int window_step_x = 8 / sizeof(float);
1306 const auto window_start_x = static_cast<int>(window.x().start());
1307 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001308 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
giuros01154bc1c2019-03-26 17:44:40 +00001309
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001310 using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
1311
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001312 if(is_broadcast_across_x)
1313 {
1314 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1315 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1316 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001317 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1318 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
giuros01154bc1c2019-03-26 17:44:40 +00001319
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001320 // Clear X Dimension on execution window as we handle manually
1321 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
giuros01154bc1c2019-03-26 17:44:40 +00001322
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001323 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1324 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001325 Iterator dst(out, win);
giuros01154bc1c2019-03-26 17:44:40 +00001326
Omar Al Khatib605a9282022-11-01 17:01:24 +00001327 execute_window_loop(
1328 win, [&](const Coordinates &)
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001329 {
1330 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001331 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
giuros01154bc1c2019-03-26 17:44:40 +00001332
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001333 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
1334
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001335 // Compute window_step_x elements per iteration
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001336 int x = window_start_x;
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001337 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1338 {
1339 const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
1340 float32x4_t b = vdupq_n_f32(broadcast_value);
1341
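            // b holds the same scalar in every lane, so unlike the non-broadcast
            // path no vrev64 is needed before applying the sign mask.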
1342 const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
1343 const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1344 const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1345 const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1346 const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
1347
1348 const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1349 const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
1350
1351 float32x4_t res = wrapper::vmul(tmp0, b);
1352 b = wrapper::vmul(b, mask);
1353
1354 res = wrapper::vmla(res, tmp1, b);
1355 wrapper::vstore(output_ptr + 2 * x, res);
1356 }
1357
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001358 // Compute left-over elements
1359 for(; x < window_end_x; ++x)
1360 {
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001361 const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
1362 const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
1363 auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
1364 auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
1365 *(output_ptr + 2 * x) = res1;
1366 *(output_ptr + 2 * x + 1) = res2;
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001367 }
1368 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001369 broadcast_input, non_broadcast_input, dst);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001370 }
1371 else
1372 {
1373 // Clear X Dimension on execution window as we handle manually
1374 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1375 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1376
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001377 Iterator input1(src1, input1_win);
1378 Iterator input2(src2, input2_win);
1379 Iterator dst(out, win);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001380
Omar Al Khatib605a9282022-11-01 17:01:24 +00001381 execute_window_loop(
1382 win, [&](const Coordinates &)
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001383 {
1384 const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
1385 const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001386 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001387
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001388 // Compute window_step_x elements per iteration
1389 int x = window_start_x;
1390 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1391 {
1392 const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
1393 float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
1394
1395 const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
1396 const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1397 const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1398 const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1399 const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
1400
1401 const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1402 const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
1403
1404 float32x4_t res = wrapper::vmul(tmp0, b);
1405
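            // Swap the real/imag lanes of b and negate its imaginary lanes so the
            // multiply-accumulate below yields (a0*b0 - a1*b1, a0*b1 + a1*b0) per pair.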
1406 b = wrapper::vrev64(b);
1407 b = wrapper::vmul(b, mask);
1408
1409 res = wrapper::vmla(res, tmp1, b);
1410 wrapper::vstore(output_ptr + 2 * x, res);
1411 }
1412
1413 // Compute left-over elements
1414 for(; x < window_end_x; ++x)
1415 {
1416 const auto a0 = *(input1_ptr + 2 * x);
1417 const auto a1 = *(input1_ptr + 2 * x + 1);
1418 const auto b0 = *(input2_ptr + 2 * x);
1419 const auto b1 = *(input2_ptr + 2 * x + 1);
1420 auto res1 = a0 * b0 - a1 * b1;
1421 auto res2 = a0 * b1 + a1 * b0;
1422 *(output_ptr + 2 * x) = res1;
1423 *(output_ptr + 2 * x + 1) = res2;
1424 }
1425 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001426 input1, input2, dst);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001427 }
giuros01154bc1c2019-03-26 17:44:40 +00001428}
1429
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001430#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001431void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001432{
1433 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001434 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1435 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001436
1437 // Clear X Dimension on execution window as we handle manually
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001438 Window win = window;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001439 win.set(Window::DimX, Window::Dimension(0, 1, 1));
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001440 constexpr int window_step_x = 16;
1441 const auto window_start_x = static_cast<int>(window.x().start());
1442 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001443 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
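    // Same structure as mul_F32_F32_F32, processing 16 fp16 elements per iteration
    // as two 8-lane halves.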
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001444 if(is_broadcast_across_x)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001445 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001446 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1447 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1448 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001449 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1450 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001451 // Clear X Dimension on execution window as we handle manually
1452 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1453 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1454 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001455 Iterator dst(out, win);
Omar Al Khatib605a9282022-11-01 17:01:24 +00001456 execute_window_loop(
1457 win, [&](const Coordinates &)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001458 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001459 const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001460 const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001461 const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
Omar Al Khatib605a9282022-11-01 17:01:24 +00001462 const float16x8x2_t broadcast_value_vec =
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001463 {
1464 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001465 vdupq_n_f16(broadcast_value),
1466 vdupq_n_f16(broadcast_value),
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001467 }
1468 };
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001469 const auto scale_vec = vdupq_n_f16(scale);
1470 // Compute window_step_x elements per iteration
1471 int x = window_start_x;
1472 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001473 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001474 const float16x8x2_t non_broadcast_v =
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001475 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001476 {
1477 vld1q_f16(non_broadcast_input_ptr + x),
1478 vld1q_f16(non_broadcast_input_ptr + x + 8),
1479 }
1480 };
1481 const float16x8x2_t result =
1482 {
1483 {
1484 vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
1485 vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
1486 }
1487 };
1488 vst1q_f16(output_ptr + x, result.val[0]);
1489 vst1q_f16(output_ptr + x + 8, result.val[1]);
1490 }
1491 // Compute left-over elements
1492 for(; x < window_end_x; ++x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001493 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001494 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
1495 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
1496 }
1497 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001498 broadcast_input, non_broadcast_input, dst);
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001499 }
1500 else
1501 {
1502 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1503 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001504 Iterator input1(src1, input1_win);
1505 Iterator input2(src2, input2_win);
1506 Iterator dst(out, win);
Omar Al Khatib605a9282022-11-01 17:01:24 +00001507 execute_window_loop(
1508 win, [&](const Coordinates &)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001509 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001510 const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
1511 const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001512 const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001513 // Compute window_step_x elements per iteration
1514 int x = window_start_x;
1515 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1516 {
1517 const float16x8x2_t ta1 =
1518 {
1519 {
1520 vld1q_f16(input1_ptr + x),
1521 vld1q_f16(input1_ptr + x + 8),
1522 }
1523 };
1524 const float16x8x2_t ta2 =
1525 {
1526 {
1527 vld1q_f16(input2_ptr + x),
1528 vld1q_f16(input2_ptr + x + 8),
1529 }
1530 };
1531 const float16x8_t scale_vec = vdupq_n_f16(scale);
Omar Al Khatib605a9282022-11-01 17:01:24 +00001532 const float16x8x2_t result =
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001533 {
1534 {
1535 vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
1536 vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
1537 }
1538 };
1539 vst1q_f16(output_ptr + x, result.val[0]);
1540 vst1q_f16(output_ptr + x + 8, result.val[1]);
1541 }
1542 // Compute left-over elements
1543 for(; x < window_end_x; ++x)
1544 {
1545 const auto ta1 = *(input1_ptr + x);
1546 const auto ta2 = *(input2_ptr + x);
1547 *(output_ptr + x) = ta1 * ta2 * scale;
1548 }
1549 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001550 input1, input2, dst);
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001551 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001552}
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001553#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellodf246182017-07-03 16:25:09 +01001554
1555template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001556void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001557{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001558 // Create input windows
1559 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001560 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1561 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001562
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001563 // Clear X Dimension on execution window as we handle manually
1564 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1565 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1566 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001567
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001568 Iterator input1(src1, input1_win);
1569 Iterator input2(src2, input2_win);
1570 Iterator dst(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001571
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001572 const int window_step_x = 16 / sizeof(uint8_t);
1573 const auto window_start_x = static_cast<int>(window.x().start());
1574 const auto window_end_x = static_cast<int>(window.x().end());
1575
Omar Al Khatib605a9282022-11-01 17:01:24 +00001576 execute_window_loop(
1577 win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001578 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001579 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
1580 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001581 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001582
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001583 // Compute window_step_x elements per iteration
1584 int x = window_start_x;
1585 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001586 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001587 const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
1588 const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
1589
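            // Widen each u8 half to u16 before multiplying: a full 255 * 255 product
            // needs 16 bits prior to scaling.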
1590 uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
1591 uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
1592 tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
1593 tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
1594
1595 if(is_scale255)
1596 {
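                // Scale the 16-bit products by 1/255, rounding to nearest (adding 0.5
                // before truncation), matching the scalar tail below.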
1597 tmp_low = scale255_U16_U16(tmp_low);
1598 tmp_high = scale255_U16_U16(tmp_high);
1599 }
1600 else
1601 {
1602 const int16x8_t vn = vdupq_n_s16(-n);
1603
1604 if(is_sat)
1605 {
1606 tmp_low = vqshlq_u16(tmp_low, vn);
1607 tmp_high = vqshlq_u16(tmp_high, vn);
1608 }
1609 else
1610 {
1611 tmp_low = vshlq_u16(tmp_low, vn);
1612 tmp_high = vshlq_u16(tmp_high, vn);
1613 }
1614 }
1615
1616 if(is_sat)
1617 {
1618 static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
1619
1620 tmp_low = vminq_u16(tmp_low, max);
1621 tmp_high = vminq_u16(tmp_high, max);
1622 }
1623
1624 vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
1625 vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001626 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001627
1628 // Compute left-over elements
1629 for(; x < window_end_x; ++x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001630 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001631 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1632
1633 if(is_scale255)
1634 {
1635 float tmp_f = static_cast<float>(tmp) * scale255_constant;
1636 tmp = static_cast<int32_t>(tmp_f + 0.5f);
1637 }
1638 else
1639 {
1640 tmp >>= n;
1641 }
1642
1643 if(is_sat)
1644 {
1645 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
1646 }
1647
1648 *(output_ptr + x) = static_cast<int16_t>(tmp);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001649 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001650 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001651 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001652}
1653
1654template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001655void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001656{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001657 // Create input windows
1658 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001659 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1660 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001661
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001662 // Clear X Dimension on execution window as we handle manually
1663 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1664 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1665 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001666
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001667 Iterator input1(src1, input1_win);
1668 Iterator input2(src2, input2_win);
1669 Iterator dst(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001670
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001671 const int window_step_x = 16;
1672 const auto window_start_x = static_cast<int>(window.x().start());
1673 const auto window_end_x = static_cast<int>(window.x().end());
1674
Omar Al Khatib605a9282022-11-01 17:01:24 +00001675 execute_window_loop(
1676 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001677 {
1678 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
1679 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001680 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001681
1682 // Compute window_step_x elements per iteration
1683 int x = window_start_x;
1684 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1685 {
1686 const int16x8x2_t ta1 =
1687 {
1688 {
1689 vld1q_s16(input1_ptr + x),
1690 vld1q_s16(input1_ptr + x + 8),
1691 }
1692 };
1693 const uint8x8x2_t ta2u =
1694 {
1695 {
1696 vld1_u8(input2_ptr + x),
1697 vld1_u8(input2_ptr + x + 8),
1698 }
1699 };
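            // Widen the u8 input to s16 so the signed S16 * S16 helper can be reused.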
1700 const int16x8x2_t ta2 =
1701 {
1702 {
1703 vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
1704 vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
1705 }
1706 };
1707
1708 const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
1709
1710 vst1q_s16(output_ptr + x, result.val[0]);
1711 vst1q_s16(output_ptr + x + 8, result.val[1]);
1712 }
1713
1714 // Compute left-over elements
1715 for(; x < window_end_x; ++x)
1716 {
1717 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1718
1719 if(is_scale255)
1720 {
1721 float tmp_f = static_cast<float>(tmp) * scale255_constant;
1722
1723 tmp = static_cast<int32_t>(tmp_f + 0.5f);
1724 }
1725 else
1726 {
1727 if(tmp >= 0)
1728 {
1729 tmp >>= n;
1730 }
1731 else
1732 {
1733 uint32_t mask = (1u << n) - 1;
1734 tmp = (tmp + static_cast<int32_t>(mask)) >> n;
1735 }
1736 }
1737 if(is_sat)
1738 {
1739 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
1740 }
1741 *(output_ptr + x) = static_cast<int16_t>(tmp);
1742 }
1743 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001744 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001745}
1746
1747template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001748void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001749{
    // Multiplication is commutative, so simply swap the two inputs and reuse mul_S16_U8_S16
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001751 mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001752}
1753} // namespace
1754
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001755void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001756{
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001757 ARM_COMPUTE_UNUSED(rounding_policy);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001758 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
Georgios Pinitasf0dea702017-07-03 18:17:28 +01001759
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001760 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001761
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001762 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001763
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001764 // Auto initialize dst if not initialized
1765 set_shape_if_empty(*dst, out_shape);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001766
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001767 _scale = scale;
1768 _scale_exponent = 0;
1769 _func_quantized = nullptr;
1770 _func_int = nullptr;
1771 _func_float = nullptr;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001772
1773 bool is_scale_255 = false;
1774 // Check and validate scaling factor
1775 if(std::abs(scale - scale255_constant) < 0.00001f)
1776 {
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001777 is_scale_255 = true;
1778 }
1779 else
1780 {
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001781 int exponent = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001782
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001783 std::frexp(scale, &exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001784
        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we need to subtract 1 to compensate for frexp using a mantissa of 0.5
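        // e.g. for scale = 1/8, std::frexp(0.125) yields 0.5 * 2^-2, so exponent = -2
        // and _scale_exponent = |-2 - 1| = 3, i.e. a right shift by 3 (1/2^3 == 1/8)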
1787 _scale_exponent = std::abs(exponent - 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001788 }
1789
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001790 const DataType dt_input1 = src1->data_type();
1791 const DataType dt_input2 = src2->data_type();
1792 const DataType dt_output = dst->data_type();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001793 const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
1794
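    // Select the kernel specialization from the data type combination; for the
    // integer kernels the 1/255 scaling and saturation options are baked in as
    // template parameters.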
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001795 switch(dt_input1)
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001796 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001797 case DataType::QASYMM8:
1798 if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
1799 {
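                // Prefer the fixed-point path when the tensors' quantization info and
                // scale allow it; otherwise fall back to the generic saturating kernel.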
Omar Al Khatib605a9282022-11-01 17:01:24 +00001800 if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
1801 {
1802 _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
1803 }
1804 else
1805 {
1806 _func_quantized = &mul_saturate_quantized_8<uint8_t>;
1807 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001808 }
1809 break;
1810 case DataType::QASYMM8_SIGNED:
1811 if(dt_input2 == DataType::QASYMM8_SIGNED)
1812 {
Omar Al Khatib605a9282022-11-01 17:01:24 +00001813 if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
1814 {
1815 _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
1816 }
1817 else
1818 {
1819 _func_quantized = &mul_saturate_quantized_8<int8_t>;
1820 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001821 }
1822 break;
1823 case DataType::QSYMM16:
1824 if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
1825 {
1826 _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
1827 }
1828 else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
1829 {
1830 _func_int = &mul_QSYMM16_QSYMM16_S32;
1831 }
1832 break;
1833 case DataType::S16:
1834 if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1835 {
1836 if(is_scale_255)
1837 {
1838 _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
1839 }
1840 else
1841 {
1842 _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
1843 }
1844 }
1845 if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1846 {
1847 if(is_scale_255)
1848 {
1849 _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
1850 }
1851 else
1852 {
1853 _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
1854 }
1855 }
1856 break;
SiCong Libb88f892020-08-28 11:18:47 +01001857 case DataType::S32:
1858 if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
1859 {
1860 _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
1861 }
1862 break;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001863 case DataType::U8:
1864 if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
1865 {
1866 if(is_scale_255)
1867 {
1868 _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
1869 }
1870 else
1871 {
1872 _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
1873 }
1874 }
1875 else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1876 {
1877 if(is_scale_255)
1878 {
1879 _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
1880 }
1881 else
1882 {
1883 _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
1884 }
1885 }
1886 else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1887 {
1888 if(is_scale_255)
1889 {
1890 _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
1891 }
1892 else
1893 {
1894 _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
1895 }
1896 }
1897 break;
1898#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1899 case DataType::F16:
1900 _func_float = &mul_F16_F16_F16;
1901 break;
1902#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1903 case DataType::F32:
1904 _func_float = &mul_F32_F32_F32;
1905 break;
1906 default:
            ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001908 }
1909
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001910 // Configure kernel window
Viet-Hoa Do0d05b662022-09-09 15:39:05 +01001911 Window win;
1912 std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001913
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001914 ICpuKernel::configure(win);
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001915}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001916
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001917size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
1918{
1919 ARM_COMPUTE_UNUSED(thread_count);
1920
1921#if defined(ENABLE_FP32_KERNELS)
1922 if(this->_func_float == &mul_F32_F32_F32)
1923 {
1924 size_t mws = ICPPKernel::default_mws;
1925 if(platform.get_cpu_model() == CPUModel::N1)
1926 {
1927 mws = default_mws_N1_fp32_neon;
1928 }
1929 else if(platform.get_cpu_model() == CPUModel::V1)
1930 {
1931 mws = default_mws_V1_fp32_neon;
1932 }
        else
        {
            if(_split_dimension == Window::DimX)
            {
                // Don't split the work load too small if the tensor has been reinterpreted as 1D.
                // This number is loosely chosen as threading overhead in each platform varies wildly.
                return 10240;
            }
            return ICPPKernel::default_mws;
        }
1937
1938 // tensor is 1D or was re-interpreted as 1D
1939 if(this->window().shape().num_dimensions() == 1)
1940 {
1941 return mws;
1942 }
1943 else
1944 {
            // Scale mws down by the number of iterations along all the dimensions (x, z, w, etc.) except
            // the one that we parallelize along (the y dimension). This allows for parallelization when
            // the y size is small but the other sizes are large, which boosts performance.
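            // Hypothetical example: for a 256x32x2 iteration space on N1,
            // 22447 / ((256 * 32 * 2) / 32) = 22447 / 512, i.e. an mws of ~43.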
1948 mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
1949 return std::max(static_cast<size_t>(1), mws);
1950 }
1951 }
1952#else /* ENABLE_FP32_KERNELS */
1953 ARM_COMPUTE_UNUSED(platform);
1954#endif /* ENABLE_FP32_KERNELS */
    if(_split_dimension == Window::DimX)
    {
        // Don't split the work load too small if the tensor has been reinterpreted as 1D.
        // This number is loosely chosen as threading overhead in each platform varies wildly.
        return 10240;
    }
    return ICPPKernel::default_mws;
1956}
1957
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001958Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
1959 RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001960{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001961 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1962 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001963
Georgios Pinitas631c41a2017-12-06 11:53:03 +00001964 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001965}
1966
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001967void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001968{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001969 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001970 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001971 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001972
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001973 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1974 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1975 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001976
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001977 if(_func_quantized != nullptr)
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001978 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001979 (*_func_quantized)(src1, src2, dst, window, _scale);
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001980 }
1981 else if(_func_int != nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001982 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001983 (*_func_int)(src1, src2, dst, window, _scale_exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001984 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001985 else
1986 {
1987 ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001988 (*_func_float)(src1, src2, dst, window, _scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001989 }
1990}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00001991
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001992const char *CpuMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001993{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001994 return "CpuMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001995}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00001996
giuros01154bc1c2019-03-26 17:44:40 +00002011namespace
2012{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002013Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002014{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002015 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
2016 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
giuros01154bc1c2019-03-26 17:44:40 +00002017
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002018 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00002019
2020 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
2021
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002022 // Validate in case of configured dst
2023 if(dst->total_size() > 0)
giuros01154bc1c2019-03-26 17:44:40 +00002024 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002025 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
2026 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
giuros01154bc1c2019-03-26 17:44:40 +00002027 }
2028
2029 return Status{};
2030}
Sheri Zhang4d91dc62020-09-23 11:22:50 +01002031} // namespace
giuros01154bc1c2019-03-26 17:44:40 +00002032
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002033void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002034{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002035 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2036 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
Sheri Zhang4d91dc62020-09-23 11:22:50 +01002037
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002038 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00002039
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002040 // Auto initialize dst if not initialized
2041 const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
2042 auto_init_if_empty(*dst, out_info);
giuros01154bc1c2019-03-26 17:44:40 +00002043
giuros01154bc1c2019-03-26 17:44:40 +00002044 // Configure kernel window
SiCongLic7b1e842021-02-22 14:28:33 +00002045 Window win = calculate_max_window(out_shape);
giuros01154bc1c2019-03-26 17:44:40 +00002046
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002047 ICpuKernel::configure(win);
giuros01154bc1c2019-03-26 17:44:40 +00002048}
2049
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002050Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002051{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002052 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2053 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
giuros01154bc1c2019-03-26 17:44:40 +00002054
2055 return Status{};
2056}
2057
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002058void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
giuros01154bc1c2019-03-26 17:44:40 +00002059{
2060 ARM_COMPUTE_UNUSED(info);
2061 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002062 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
giuros01154bc1c2019-03-26 17:44:40 +00002063
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002064 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
2065 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
2066 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01002067
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002068 c_mul_F32_F32_F32_n(src1, src2, dst, window);
giuros01154bc1c2019-03-26 17:44:40 +00002069}
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002070
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002071const char *CpuComplexMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002072{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002073 return "CpuComplexMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002074}
2075} // namespace kernels
2076} // namespace cpu
Manuel Bottini79fa9a22019-02-22 17:54:22 +00002077} // namespace arm_compute