Blame - src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp - ml/ComputeLibrary

2019-10-15 11:09:33 +0100

[diff] [blame]

91

{

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

92

}

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

93

};

94

95

inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)

96

{

97

const int32_t current_h = base_h + h * dilation.y();

98

const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);

99

100

const int32_t current_w = base_w + w * dilation.x();

101

const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);

102

103

return is_valid_h && is_valid_w;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

104

}

105

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

106

template <typename T>

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

107

void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

108

const Size2D &dilation, const Window &window, bool has_biases)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

109

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

110

constexpr auto element_per_vector = vector_size / sizeof(T);

111

using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;

112

using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

113

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

114

const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);

115

116

const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});

117

118

Window execution_window = window;

119

execution_window.set(Window::DimX, dim_single_unit_step);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

120

121

Window win_input = window;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

122

win_input.set(Window::DimX, dim_manual_loop);

123

win_input.set(Window::DimY, dim_manual_loop);

124

win_input.set(Window::DimZ, dim_manual_loop);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

125

126

Window win_weights = win_input;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

127

win_weights.set(Window::DimW, dim_manual_loop);

128

129

Window win_output = window;

130

win_output.set(Window::DimX, dim_manual_loop);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

131

132

Iterator input_it(input, win_input);

133

Iterator weights_it(weights, win_weights);

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

134

Iterator output_it(output, win_output);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

135

Iterator biases_it{};

if(has_biases)

{

biases_it = Iterator(biases, win_weights);

140

}

141

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

142

execute_window_loop(execution_window, [&](const Coordinates & id)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

143

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

144

const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;

145

const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;

146

const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

147

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

148

auto const base_weights_ptr = weights_it.ptr();

149

uint32_t x = run_info.x_start;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

150

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

151

for(; x < run_info.x_leftover_start; x += run_info.x_step)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

152

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

153

VectorType acc = zero_vector;

154

auto weights_ptr = base_weights_ptr;

155

int64_t input_offset = base_input_offset;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

156

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

157

for(uint32_t h = 0; h < run_info.weights_height; ++h)

158

{

159

int64_t offs = input_offset + x * sizeof(T);

160

for(uint32_t w = 0; w < run_info.weights_width; ++w)

161

{

162

const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);

163

const auto input_vals = is_valid_region ?

164

wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :

165

zero_vector;

166

const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);

167

acc = wrapper::vmla(acc, weights_vals, input_vals);

168

169

offs += dilation.x() * run_info.input_stride_y;

170

}

171

172

weights_ptr += run_info.weights_stride_z;

173

input_offset += dilation.y() * run_info.input_stride_z;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

174

}

175

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

176

if(has_biases)

177

{

178

const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);

179

acc = wrapper::vadd(acc, biases_vals);

180

}

181

182

wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

183

}

184

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

185

for(; x < run_info.x_end; ++x)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

186

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

187

auto acc_scalar = T{ 0 };

188

auto weights_ptr = base_weights_ptr;

189

int64_t input_offset = base_input_offset;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

190

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

191

for(size_t h = 0; h < run_info.weights_height; ++h)

192

{

193

int64_t offs = input_offset + x * sizeof(T);

194

for(size_t w = 0; w < run_info.weights_width; ++w)

195

{

196

const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);

197

const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;

198

const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);

199

200

acc_scalar += (input_vals * weights_vals);

201

202

offs += dilation.x() * run_info.input_stride_y;

203

}

204

205

weights_ptr += run_info.weights_stride_z;

206

input_offset += dilation.y() * run_info.input_stride_z;

}

if(has_biases)

{

const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);

212

acc_scalar += biases_vals;

213

}

214

*(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;

215

}

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

216

},

217

input_it, weights_it, biases_it, output_it);

218

}

219

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

220

template <typename T>

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

221

void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

222

const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

223

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

224

const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

225

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

226

Window execution_window = window;

227

execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

228

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

229

Window win_input = execution_window;

230

win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

231

win_input.set(Window::DimY, dim_manual_loop);

232

win_input.set(Window::DimZ, dim_manual_loop);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

233

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

234

Window win_weights = window;

235

win_weights.set_dimension_step(Window::DimX, run_info.x_step);

236

win_weights.set(Window::DimY, dim_manual_loop);

237

win_weights.set(Window::DimZ, dim_manual_loop);

238

win_weights.set(Window::DimW, dim_manual_loop);

239

240

Window win_output = window;

241

win_output.set_dimension_step(Window::DimX, run_info.x_step);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

242

243

Iterator input_it(input, win_input);

244

Iterator weights_it(weights, win_weights);

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

245

Iterator output_it(output, win_output);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

246

Iterator biases_it{};

if(has_biases)

{

biases_it = Iterator(biases, win_weights);

251

}

252

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

253

execute_window_loop(execution_window, [&](const Coordinates & id)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

254

{

255

std::vector<T> acc(depth_multiplier, static_cast<T>(0));

256

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

257

const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;

258

const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;

259

int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

260

261

auto weights_ptr = weights_it.ptr();

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

262

for(size_t h = 0; h < run_info.weights_height; ++h)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

263

{

264

int offs = input_offset;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

265

for(size_t w = 0; w < run_info.weights_width; ++w)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

266

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

267

const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);

268

const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

269

270

for(size_t m = 0; m < depth_multiplier; ++m)

271

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

272

const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));

Georgios Pinitas

1c29ffc

2019-08-01 15:03:00 +0100

[diff] [blame]

273

acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

274

}

275

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

276

offs += dilation.x() * run_info.input_stride_y;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

277

}

278

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

279

weights_ptr += run_info.weights_stride_z;

280

input_offset += dilation.y() * run_info.input_stride_z;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

}

if(has_biases)

{

for(size_t m = 0; m < depth_multiplier; ++m)

286

{

287

const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));

288

*(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;

}

}

else

{

for(size_t m = 0; m < depth_multiplier; ++m)

294

{

295

*(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);

}

}

},

input_it, weights_it, biases_it, output_it);

300

}

301

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

302

template <typename T, typename TW>

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

303

void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

304

const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

305

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

306

constexpr auto element_per_vector = vector_size / sizeof(T);

307

using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;

308

using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;

309

using AccType = int32_t;

310

using AccArrayType = std::array<AccType, element_per_vector>;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

311

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

312

const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();

313

const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});

314

315

const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

316

317

const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;

318

const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;

319

const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

320

const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

321

322

Window execution_window = window;

323

execution_window.set(Window::DimX, dim_single_unit_step);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

324

325

Window win_input = window;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

326

win_input.set(Window::DimX, dim_manual_loop);

327

win_input.set(Window::DimY, dim_manual_loop);

328

win_input.set(Window::DimZ, dim_manual_loop);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

329

330

Window win_weights = win_input;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

331

win_weights.set(Window::DimW, dim_manual_loop);

332

333

Window win_output = window;

334

win_output.set(Window::DimX, dim_manual_loop);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

335

336

Iterator input_it(input, win_input);

337

Iterator weights_it(weights, win_weights);

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

338

Iterator output_it(output, win_output);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

339

Iterator biases_it{};

if(has_biases)

{

biases_it = Iterator(biases, win_weights);

344

}

345

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

346

execute_window_loop(execution_window, [&](const Coordinates & id)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

347

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

348

const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;

349

const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;

350

const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

351

auto const base_weights_ptr = weights_it.ptr();

352

size_t x = run_info.x_start;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

353

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

354

for(; x < run_info.x_leftover_start; x += run_info.x_step)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

355

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

356

AccArrayType acc{};

357

AccArrayType in_sum{};

358

AccArrayType we_sum{};

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

359

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

360

auto weights_ptr = base_weights_ptr;

361

auto input_offset = base_input_offset;

362

363

for(size_t h = 0; h < run_info.weights_height; ++h)

364

{

365

int64_t offs = input_offset + x * sizeof(T);

366

for(size_t w = 0; w < run_info.weights_width; ++w)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

367

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

368

const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);

369

const auto input_vals = is_valid_region ?

370

wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :

371

out_of_bound_vector;

372

const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

373

Sang-Hoon Park

1a0a4bc

2020-11-12 17:41:32 +0000

[diff] [blame]

374

for(size_t i = 0; i < element_per_vector; ++i)

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

375

{

376

acc.at(i) += input_vals[i] * weights_vals[i];

377

in_sum.at(i) += input_vals[i];

378

we_sum.at(i) += weights_vals[i];

379

}

380

381

offs += dilation.x() * run_info.input_stride_y;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

382

}

383

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

384

weights_ptr += run_info.weights_stride_z;

385

input_offset += dilation.y() * run_info.input_stride_z;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

386

}

387

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

388

VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});

Sang-Hoon Park

1a0a4bc

2020-11-12 17:41:32 +0000

[diff] [blame]

389

for(size_t i = 0; i < element_per_vector; ++i)

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

390

{

391

acc.at(i) -= in_sum.at(i) * weights_qoffset;

392

acc.at(i) -= we_sum.at(i) * input_qoffset;

393

acc.at(i) += k_offset;

if(has_biases)

{

acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);

398

}

399

400

const int32_t out_mul = output_multiplier.at(x + i);

401

const int32_t out_shift = output_shift.at(x + i);

402

if(out_shift < 0)

403

{

404

acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;

}

else

{

acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;

409

}

410

out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));

411

}

412

413

wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

414

}

415

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

416

// left-over

417

for(; x < run_info.x_end; ++x)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

418

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

AccType acc = 0;

AccType in_sum = 0;

AccType we_sum = 0;

auto weights_ptr = base_weights_ptr;

424

auto input_offset = base_input_offset;

425

426

for(size_t h = 0; h < run_info.weights_height; ++h)

427

{

428

int64_t offs = input_offset + x * sizeof(T);

429

for(size_t w = 0; w < run_info.weights_width; ++w)

430

{

431

const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);

432

const auto input_val = is_valid_region ?

433

*reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :

434

out_of_bound_value;

435

const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

436

437

acc += input_val * weights_val;

438

in_sum += input_val;

439

we_sum += weights_val;

440

441

offs += dilation.x() * run_info.input_stride_y;

442

}

443

444

weights_ptr += run_info.weights_stride_z;

445

input_offset += dilation.y() * run_info.input_stride_z;

}

T out_vals{ 0 };

acc -= in_sum * weights_qoffset;

451

acc -= we_sum * input_qoffset;

452

acc += k_offset;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

453

454

if(has_biases)

455

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

456

acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

457

}

458

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

459

const int32_t out_mul = output_multiplier.at(x);

460

const int32_t out_shift = output_shift.at(x);

461

Michele Di Giorgio

2019-10-29 10:58:13 +0000

[diff] [blame]

462

if(out_shift < 0)

463

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

464

acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;

Michele Di Giorgio

2019-10-29 10:58:13 +0000

[diff] [blame]

465

}

466

else

467

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

468

acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;

Michele Di Giorgio

2019-10-29 10:58:13 +0000

[diff] [blame]

469

}

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

470

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

471

out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));

472

*(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;

473

}

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

474

},

475

input_it, weights_it, biases_it, output_it);

476

}

477

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

478

template <typename T, typename TW>

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

479

void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

480

const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

481

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

482

using AccType = int32_t;

483

484

const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

485

486

const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

487

488

const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;

489

const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;

490

const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

491

const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

492

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

493

Window execution_window = window;

494

execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

495

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

496

Window win_input = execution_window;

497

win_input.set(Window::DimY, dim_manual_loop);

498

win_input.set(Window::DimZ, dim_manual_loop);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

499

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

500

Window win_weights = window;

501

win_weights.set_dimension_step(Window::DimX, run_info.x_step);

502

win_weights.set(Window::DimY, dim_manual_loop);

503

win_weights.set(Window::DimZ, dim_manual_loop);

504

win_weights.set(Window::DimW, dim_manual_loop);

505

506

Window win_output = window;

507

win_output.set_dimension_step(Window::DimX, run_info.x_step);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

508

509

Iterator input_it(input, win_input);

510

Iterator weights_it(weights, win_weights);

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

511

Iterator output_it(output, win_output);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

512

Iterator biases_it{};

if(has_biases)

{

biases_it = Iterator(biases, win_weights);

517

}

518

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

519

execute_window_loop(execution_window, [&](const Coordinates & id)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

520

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

521

std::vector<AccType> acc(depth_multiplier, 0);

522

std::vector<AccType> we_sum(depth_multiplier, 0);

523

AccType in_sum = 0;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

524

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

525

const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;

526

const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;

527

int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

528

529

auto weights_ptr = weights_it.ptr();

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

530

for(size_t h = 0; h < run_info.weights_height; ++h)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

531

{

532

int offs = input_offset;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

533

for(size_t w = 0; w < run_info.weights_width; ++w)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

534

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

535

const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);

536

const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

537

538

for(size_t m = 0; m < depth_multiplier; ++m)

539

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

540

const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

541

acc.at(m) += input_val * weights_val;

542

543

we_sum.at(m) += weights_val;

544

}

545

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

546

offs += dilation.x() * run_info.input_stride_y;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

in_sum += input_val;

}

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

550

weights_ptr += run_info.weights_stride_z;

551

input_offset += dilation.y() * run_info.input_stride_z;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

552

}

553

554

for(size_t m = 0; m < depth_multiplier; ++m)

555

{

556

acc.at(m) -= in_sum * weights_qoffset;

557

acc.at(m) -= we_sum.at(m) * input_qoffset;

558

acc.at(m) += k_offset;

559

560

if(has_biases)

561

{

Michele Di Giorgio

2019-10-29 10:58:13 +0000

[diff] [blame]

562

acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));

563

}

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

564

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

565

const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);

566

const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);

Michele Di Giorgio

2019-10-29 10:58:13 +0000

[diff] [blame]

567

if(out_shift < 0)

568

{

569

acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

570

}

571

else

572

{

Michele Di Giorgio

2019-10-29 10:58:13 +0000

[diff] [blame]

573

acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

574

}

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

575

*(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

576

}

577

},

578

input_it, weights_it, biases_it, output_it);

579

}

580

Giorgio Arena

3737c79

2020-11-23 17:47:23 +0000

[diff] [blame]

581

template <typename T, typename TW>

582

void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,

583

const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)

584

{

585

constexpr int half_vec = vector_size / 2;

586

587

using AccType = int32_t;

588

using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type;

589

using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;

590

using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;

591

592

const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

593

594

const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(input->info()->quantization_info().uniform().offset), TagType{})));

595

const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));

596

const auto output_qoffset_vec = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});

597

598

const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});

599

const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});

600

const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});

601

602

const auto out_mul = output_multiplier.at(0);

603

const auto out_shift = output_shift.at(0);

604

605

Window execution_window = window;

606

execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

607

608

Window win_input = execution_window;

609

win_input.set(Window::DimY, dim_manual_loop);

610

win_input.set(Window::DimZ, dim_manual_loop);

611

612

Window win_weights = window;

613

win_weights.set_dimension_step(Window::DimX, run_info.x_step);

614

win_weights.set(Window::DimY, dim_manual_loop);

615

win_weights.set(Window::DimZ, dim_manual_loop);

616

win_weights.set(Window::DimW, dim_manual_loop);

617

618

Window win_output = window;

619

win_output.set_dimension_step(Window::DimX, run_info.x_step);

620

621

Iterator input_it(input, win_input);

622

Iterator weights_it(weights, win_weights);

623

Iterator output_it(output, win_output);

624

Iterator biases_it{};

if(has_biases)

{

biases_it = Iterator(biases, win_weights);

629

}

630

631

std::vector<AccVectorType> acc0(depth_multiplier / vector_size);

632

std::vector<AccVectorType> acc1(depth_multiplier / vector_size);

633

634

execute_window_loop(execution_window, [&](const Coordinates & id)

635

{

636

std::fill(begin(acc0), end(acc0), zero);

637

std::fill(begin(acc1), end(acc1), zero);

638

639

const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;

640

const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;

641

int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

642

643

auto weights_ptr = weights_it.ptr();

644

for(size_t h = 0; h < run_info.weights_height; ++h)

645

{

646

const int32_t current_h = input_z + h * dilation.y();

647

if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))

648

{

649

int offs = input_offset;

650

for(size_t w = 0; w < run_info.weights_width; ++w)

651

{

652

const int32_t current_w = input_y + w * dilation.x();

653

if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))

654

{

655

const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});

656

const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));

657

const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);

658

659

for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)

660

{

661

const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));

662

const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));

663

const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);

664

665

acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));

666

acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));

}

}

offs += dilation.x() * run_info.input_stride_y;

}

}

weights_ptr += run_info.weights_stride_z;

675

input_offset += dilation.y() * run_info.input_stride_z;

676

}

677

678

for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)

{

if(has_biases)

{

const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));

683

const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));

684

685

acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);

686

acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);

}

if(out_shift < 0)

{

acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);

692

acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);

}

else

{

acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);

697

acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);

698

}

699

700

acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);

701

acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);

702

703

const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),

704

wrapper::vmovn(acc1.at(i)));

705

706

if(std::is_same<T, uint8_t>::value)

707

{

708

wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));

}

else

{

wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));

}

}

},

input_it, weights_it, biases_it, output_it);

717

}

718

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

719

/** Check that the tensor infos and convolution parameters form a configuration this kernel accepts.
 *
 * The checks run in order and the first failing one determines the returned Status.
 *
 * @param[in] input            Input tensor info. Must not be nullptr. Accepted data types: QASYMM8, QASYMM8_SIGNED, F16, F32.
 * @param[in] weights          Weights tensor info. Must not be nullptr. Either QSYMM8_PER_CHANNEL or the same data type as @p input.
 * @param[in] biases           Biases tensor info, or nullptr when there are no biases. At most 1D, with as many elements as
 *                             @p weights has along dimension 0. S32 for asymmetric quantized inputs, otherwise the data type of @p weights.
 * @param[in] output           Output tensor info. Must not be nullptr. Shape and data type are only checked when its total size is not zero.
 * @param[in] conv_info        Padding and stride information. Both strides must be at least 1.
 * @param[in] depth_multiplier Depth multiplier. Must not be zero.
 * @param[in] dilation         Dilation along x and y. Both components must be at least 1.
 *
 * @return A default-constructed Status when every check passes, otherwise the Status produced by the failing check.
 */
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                          const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);

    // Reject a zero dilation before it is used below: `dilation.x() - 1` would otherwise be evaluated with an
    // out-of-range value (and presumably wrap around, assuming Size2D components are unsigned).
    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));

    // The dilated kernel extent must fit inside the padded input along dimensions 1 and 2.
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());

    // Dimension 0 of the weights must hold depth_multiplier entries per input element along dimension 0.
    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
    ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));

    if(is_data_type_quantized_per_channel(weights->data_type()))
    {
        // Per-channel weights: exactly one quantization scale per element along dimension 0.
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    }

    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));

        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
        }
    }

    // An output with zero total size is not checked here; configure() initialises it.
    if(output->total_size() != 0)
    {
        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
    }

    return Status{};
}

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

767

} // namespace

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

768

Gian Marco Iodice

bd9097d

2019-07-26 15:31:02 +0100

[diff] [blame]

769

/** Default constructor: leaves the kernel unconfigured, with no tensors bound and a depth multiplier of 1. */
NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel()
    : _func(nullptr),
      _input(nullptr),
      _weights(nullptr),
      _biases(nullptr),
      _output(nullptr),
      _conv_info(),
      _depth_multiplier(1),
      _dilation(),
      _output_multiplier(),
      _output_shift(),
      _has_biases(false)
{
}

Gian Marco Iodice

2019-07-26 15:31:02 +0100

[diff] [blame]

774

void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,

775

const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

776

{

777

ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

778

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));

_input = input;

_weights = weights;

_biases = biases;

_output = output;

_conv_info = conv_info;

785

_depth_multiplier = depth_multiplier;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

786

_dilation = dilation;

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

787

_has_biases = (biases != nullptr);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

788

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

789

if(is_data_type_quantized(_input->info()->data_type()))

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

790

{

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

791

const auto input_scale = input->info()->quantization_info().uniform().scale;

792

const auto output_scale = output->info()->quantization_info().uniform().scale;

793

794

auto weights_scale = weights->info()->quantization_info().scale();

795

if(!is_data_type_quantized_per_channel(_weights->info()->data_type()))

796

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

797

for(size_t i = 1; i < _weights->info()->dimension(channel_idx); ++i)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

798

{

799

weights_scale.push_back(weights_scale.front());

}

}

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

803

for(const auto &s : weights_scale)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

804

{

Michalis Spyrou

e7be8a0

2019-12-12 16:16:09 +0000

[diff] [blame]

805

int32_t out_mult = 0;

806

int32_t out_shift = 0;

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

807

const float multiplier = input_scale * s / output_scale;

Michele Di Giorgio

2019-10-29 10:58:13 +0000

[diff] [blame]

808

arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

809

810

_output_multiplier.push_back(out_mult);

811

_output_shift.push_back(out_shift);

}

}

switch(_weights->info()->data_type())

816

{

817

case DataType::QASYMM8:

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

818

_func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t>;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

819

break;

Michele Di Giorgio

2020-01-07 15:06:41 +0000

[diff] [blame]

820

case DataType::QASYMM8_SIGNED:

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

821

_func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;

Michele Di Giorgio

2020-01-07 15:06:41 +0000

[diff] [blame]

822

break;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

823

case DataType::QSYMM8_PER_CHANNEL:

Michele Di Giorgio

2020-01-07 15:06:41 +0000

[diff] [blame]

824

if(_input->info()->data_type() == DataType::QASYMM8)

825

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

826

_func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t>;

Michele Di Giorgio

2020-01-07 15:06:41 +0000

[diff] [blame]

827

}

828

else

829

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

830

_func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;

Michele Di Giorgio

2020-01-07 15:06:41 +0000

[diff] [blame]

831

}

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

832

break;

833

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

834

case DataType::F16:

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

835

_func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t>;

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

836

break;

837

#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

838

case DataType::F32:

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

839

_func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float>;

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

840

break;

841

default:

842

ARM_COMPUTE_ERROR("Data type not supported");

break;

}

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

846

const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);

847

auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));

848

SiCongLi

b88272e

2021-02-24 15:40:57 +0000

[diff] [blame]

849

Window win = calculate_max_window(*output->info(), Steps());

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

850

INEKernel::configure(win);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

851

}

852

Gian Marco Iodice

bd9097d

2019-07-26 15:31:02 +0100

[diff] [blame]

853

/** Static check of whether the given tensor infos and parameters would be accepted by configure().
 *
 * Delegates to validate_arguments().
 *
 * @param[in] input            Input tensor info.
 * @param[in] weights          Weights tensor info.
 * @param[in] biases           Biases tensor info, or nullptr when there are no biases.
 * @param[in] output           Output tensor info.
 * @param[in] conv_info        Padding and stride information.
 * @param[in] depth_multiplier Depth multiplier.
 * @param[in] dilation         Dilation along x and y.
 *
 * @return The error Status reported by validate_arguments(), or a default-constructed Status on success.
 */
Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                                         unsigned int depth_multiplier,
                                                         const Size2D &dilation)
{
    const Status args_status = validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation);
    ARM_COMPUTE_RETURN_ON_ERROR(args_status);

    return Status{};
}

Gian Marco Iodice

2019-07-26 15:31:02 +0100

[diff] [blame]

861

void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const ThreadInfo &info)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

862

{

863

ARM_COMPUTE_UNUSED(info);

864

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

865

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

866

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

867

(this->*_func)(window, _has_biases);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

868

}

869

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

870

template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::FloatEnalber<T>>

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

871

void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

872

{

873

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

874

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

875

876

if(_depth_multiplier == 1)

877

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

878

depthwise_loop_multiplier1_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases);

Giorgio Arena

2019-07-12 14:49:49 +0100

[diff] [blame]

879

}

880

else

881

{

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

882

depthwise_loop_generic_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window, has_biases);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

}

}

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

886

template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::Quantized8bitEnalber<T>>

Michalis Spyrou

2020-05-12 16:18:33 +0100

[diff] [blame]

887

void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

888

{

889

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

890

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

891

892

if(_depth_multiplier == 1)

893

{

Sang-Hoon Park

2020-10-01 10:13:07 +0100

[diff] [blame]

894

depthwise_loop_multiplier1_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);

Giorgio Arena

2019-10-15 11:09:33 +0100

[diff] [blame]

895

}

896

else

897

{

Giorgio Arena

3737c79

2020-11-23 17:47:23 +0000

[diff] [blame]

898

const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);

899

const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(_weights->info()->data_type()));

900

901

if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)

902

{

903

depthwise_loop_pow2_quantized_per_tensor<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);

}

else

{

depthwise_loop_generic_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);

908

}

Giorgio Arena