Blame - src/core/CL/cl_kernels/depthwise_convolution_quantized.cl - ml/ComputeLibrary

2018-02-16 11:01:04 +0000

[diff] [blame]

28

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

29

#if defined(FUSED_ACTIVATION)

30

#define DATA_TYPE uchar

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

31

#ifndef VEC_SIZE

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

32

#define VEC_SIZE 8

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

33

#endif /* VEC_SIZE */

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

34

#include "activation_layer_qa8.cl"

35

#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)

36

#else /* defined(FUSED_ACTIVATION) */

37

#define ACTIVATION_FUNC(x) (x)

38

#endif /* defined(FUSED_ACTIVATION) */

39

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

40

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

41

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)

42

#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);

43

#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)

44

#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));

45

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)

46

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

47

48

#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

49

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

50

#if CONV_STRIDE_X > 3

51

#error "Stride X not supported"

52

#endif /* CONV_STRIDE_X > 3 */

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

53

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

54

#if !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

55

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

56

#if CONV_STRIDE_X == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

57

#define GET_VALUES(first_value, left, middle, right) \

58

({ \

59

int8 temp0 = CONVERT(vload8(0, first_value), int8); \

60

int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \

61

\

62

left = CONVERT(temp0.s01234567, int8); \

63

middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \

64

right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \

65

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

66

#elif CONV_STRIDE_X == 2

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

67

#define GET_VALUES(first_value, left, middle, right) \

68

({ \

69

int16 temp0 = CONVERT(vload16(0, first_value), int16); \

70

int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \

71

\

72

left = CONVERT(temp0.s02468ace, int8); \

73

middle = CONVERT(temp0.s13579bdf, int8); \

74

right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \

75

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

76

#else /* CONV_STRIDE_X */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

77

#define GET_VALUES(first_value, left, middle, right) \

78

({ \

79

int16 temp0 = CONVERT(vload16(0, first_value), int16); \

80

int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \

81

\

82

left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \

83

middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \

84

right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \

85

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

86

#endif /* CONV_STRIDE_X */

87

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

88

/** This function computes the depthwise convolution quantized.

Anthony Barbier

f202e50

2017-11-23 18:02:04 +0000

[diff] [blame]

89

*

90

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

91

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

92

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

93

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

94

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

95

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

96

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

97

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

98

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

99

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

100

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

101

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

102

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

103

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

104

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

105

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

106

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

107

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

108

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

109

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

110

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

111

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

112

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

113

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

114

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

115

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

116

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

117

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

Anthony Barbier

f202e50

2017-11-23 18:02:04 +0000

[diff] [blame]

118

*/

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

119

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

120

__kernel void depthwise_convolution_3x3_quantized_nchw(

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

121

TENSOR3D_DECLARATION(src),

122

TENSOR3D_DECLARATION(dst),

Georgios Pinitas

5b2191e

2018-02-22 12:56:51 +0000

[diff] [blame]

123

TENSOR3D_DECLARATION(weights)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

124

#if defined(HAS_BIAS)

Georgios Pinitas

5b2191e

2018-02-22 12:56:51 +0000

[diff] [blame]

125

,

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

126

VECTOR_DECLARATION(biases)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

127

#endif //defined(HAS_BIAS)

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

128

)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

129

{

130

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

131

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

132

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

133

#if defined(HAS_BIAS)

134

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

135

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

136

int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));

137

#endif //defined(HAS_BIAS)

138

Giorgio Arena

7657224

2018-04-04 17:44:26 +0100

[diff] [blame]

139

src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

140

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

141

uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);

142

uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);

143

uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

144

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

145

int8 values0 = 0;

146

int8 sum0 = 0;

147

#if CONV_STRIDE_Y == 1

148

int8 values1 = 0;

149

int8 sum1 = 0;

150

#endif /* CONV_STRIDE_Y */

151

152

// Row0

153

int8 left, middle, right;

154

GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);

155

values0 += left * (int8)(w0.s0);

156

values0 += middle * (int8)(w0.s1);

157

values0 += right * (int8)(w0.s2);

158

159

#if WEIGHTS_OFFSET != 0

160

sum0 += left + middle + right;

161

#endif /* WEIGHTS_OFFSET != 0 */

162

163

// Row1

164

GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);

165

values0 += left * (int8)(w1.s0);

166

values0 += middle * (int8)(w1.s1);

167

values0 += right * (int8)(w1.s2);

168

#if CONV_STRIDE_Y == 1

169

values1 += left * (int8)(w0.s0);

170

values1 += middle * (int8)(w0.s1);

171

values1 += right * (int8)(w0.s2);

172

#endif /* CONV_STRIDE_Y == 1 */

173

174

#if WEIGHTS_OFFSET != 0

175

int8 tmp = left + middle + right;

176

sum0 += tmp;

177

#if CONV_STRIDE_Y == 1

178

sum1 += tmp;

179

#endif /* CONV_STRIDE_Y == 1 */

180

#endif /* WEIGHTS_OFFSET != 0 */

181

182

// Row2

183

GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);

184

values0 += left * (int8)(w2.s0);

185

values0 += middle * (int8)(w2.s1);

186

values0 += right * (int8)(w2.s2);

187

#if CONV_STRIDE_Y == 1

188

values1 += left * (int8)(w1.s0);

189

values1 += middle * (int8)(w1.s1);

190

values1 += right * (int8)(w1.s2);

191

#endif /* CONV_STRIDE_Y == 1 */

192

193

#if WEIGHTS_OFFSET != 0

194

tmp = left + middle + right;

195

sum0 += tmp;

196

#if CONV_STRIDE_Y == 1

197

sum1 += tmp;

198

#endif /* CONV_STRIDE_Y == 1 */

199

#endif /* WEIGHTS_OFFSET != 0 */

200

201

#if CONV_STRIDE_Y == 1

202

// Row3

203

GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);

204

values1 += left * (int8)(w2.s0);

205

values1 += middle * (int8)(w2.s1);

206

values1 += right * (int8)(w2.s2);

207

208

#if WEIGHTS_OFFSET != 0

209

sum1 += left + middle + right;

210

#endif /* WEIGHTS_OFFSET != 0 */

211

#endif /* CONV_STRIDE_Y == 1 */

212

213

#if defined(HAS_BIAS)

214

values0 += (int8)(bias_value);

215

#if CONV_STRIDE_Y == 1

216

values1 += (int8)(bias_value);

217

#endif /* CONV_STRIDE_Y == 1 */

218

#endif //defined(HAS_BIAS)

219

220

#if WEIGHTS_OFFSET != 0

221

values0 += sum0 * (int8)(WEIGHTS_OFFSET);

222

#if CONV_STRIDE_Y == 1

223

values1 += sum1 * (int8)(WEIGHTS_OFFSET);

224

#endif /* CONV_STRIDE_Y == 1 */

225

#endif /* WEIGHTS_OFFSET != 0 */

226

227

#if INPUT_OFFSET != 0

228

ushort sum_weights = 0;

229

ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);

230

sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;

231

values0 += sum_weights * (int8)(INPUT_OFFSET);

232

#if CONV_STRIDE_Y == 1

233

values1 += sum_weights * (int8)(INPUT_OFFSET);

234

#endif /* CONV_STRIDE_Y == 1 */

235

#endif /* INPUT_OFFSET != 0 */

236

237

#if K_OFFSET != 0

238

values0 += (int8)(K_OFFSET);

239

#if CONV_STRIDE_Y == 1

240

values1 += (int8)(K_OFFSET);

241

#endif /* CONV_STRIDE_Y == 1 */

242

#endif /* K_OFFSET != 0 */

243

244

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

245

values0 += (int8)OUTPUT_OFFSET;

246

uchar8 res0 = convert_uchar8_sat(values0);

247

res0 = max(res0, (uchar8)0);

248

res0 = min(res0, (uchar8)255);

249

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

250

vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

251

#if CONV_STRIDE_Y == 1

252

253

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

254

values1 += (int8)OUTPUT_OFFSET;

255

uchar8 res1 = convert_uchar8_sat(values1);

256

res1 = max(res1, (uchar8)0);

257

res1 = min(res1, (uchar8)255);

258

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

259

vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

260

#endif /* CONV_STRIDE_Y == 1 */

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

261

}

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

262

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

263

#else // !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

264

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

265

#if CONV_STRIDE_X == 1

266

#define GET_VALUES(first_value, left, middle, right) \

267

({ \

268

uchar8 temp0 = vload8(0, first_value); \

269

uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \

270

\

271

left = temp0.s01234567; \

272

middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \

273

right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

274

})

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

275

#elif CONV_STRIDE_X == 2

276

#define GET_VALUES(first_value, left, middle, right) \

277

({ \

278

uchar16 temp0 = vload16(0, first_value); \

279

uchar temp1 = *(first_value + 16 * sizeof(uchar)); \

280

\

281

left = temp0.s02468ace; \

282

middle = temp0.s13579bdf; \

283

right = (uchar8)(temp0.s2468, temp0.sace, temp1); \

284

})

285

#else /* CONV_STRIDE_X */

286

#define GET_VALUES(first_value, left, middle, right) \

287

({ \

288

uchar16 temp0 = vload16(0, first_value); \

289

uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \

290

\

291

left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \

292

middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \

293

right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \

294

})

295

#endif /* CONV_STRIDE_X */

296

/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

297

*

298

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

299

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

300

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

301

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

302

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

303

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

304

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

305

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

306

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

307

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

308

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

309

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

310

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

311

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

312

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

313

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

314

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

315

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

316

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

317

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

318

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

319

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

320

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

321

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

322

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

323

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

324

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

325

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

326

*/

327

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

328

__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(

329

TENSOR3D_DECLARATION(src),

330

TENSOR3D_DECLARATION(dst),

331

TENSOR3D_DECLARATION(weights)

332

#if defined(HAS_BIAS)

333

,

334

VECTOR_DECLARATION(biases)

335

#endif //defined(HAS_BIAS)

336

)

337

{

338

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

339

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

340

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

341

#if defined(HAS_BIAS)

342

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

343

344

const int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));

345

#endif //defined(HAS_BIAS)

346

347

src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

348

349

uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);

350

uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);

351

uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);

352

353

uchar8 left0, middle0, right0;

354

uchar8 left1, middle1, right1;

355

uchar8 left2, middle2, right2;

int8 values0 = 0;

int8 sum0 = 0;

GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);

361

GET_VALUES(src.ptr + 1 * src_stride_y, left1, middle1, right1);

362

GET_VALUES(src.ptr + 2 * src_stride_y, left2, middle2, right2);

363

364

#if WEIGHTS_OFFSET != 0

365

sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);

366

sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);

367

sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);

368

#endif /* WEIGHTS_OFFSET != 0 */

369

370

#if CONV_STRIDE_Y == 1

371

// If conv_stride_y is equals to 1, we compute two output rows

372

373

uchar8 left3, middle3, right3;

int8 values1 = 0;

int8 sum1 = 0;

GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);

378

379

#if WEIGHTS_OFFSET != 0

380

sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);

381

sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);

382

sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);

383

#endif /* WEIGHTS_OFFSET != 0 */

384

#endif // CONV_STRIDE_Y == 1

385

386

ARM_DOT(left0.s0, middle0.s0, right0.s0, left1.s0, w0.s0, w0.s1, w0.s2, w1.s0, values0.s0);

387

ARM_DOT(middle1.s0, right1.s0, left2.s0, middle2.s0, w1.s1, w1.s2, w2.s0, w2.s1, values0.s0);

388

values0.s0 += right2.s0 * w2.s2;

389

390

ARM_DOT(left0.s1, middle0.s1, right0.s1, left1.s1, w0.s0, w0.s1, w0.s2, w1.s0, values0.s1);

391

ARM_DOT(middle1.s1, right1.s1, left2.s1, middle2.s1, w1.s1, w1.s2, w2.s0, w2.s1, values0.s1);

392

values0.s1 += right2.s1 * w2.s2;

393

394

ARM_DOT(left0.s2, middle0.s2, right0.s2, left1.s2, w0.s0, w0.s1, w0.s2, w1.s0, values0.s2);

395

ARM_DOT(middle1.s2, right1.s2, left2.s2, middle2.s2, w1.s1, w1.s2, w2.s0, w2.s1, values0.s2);

396

values0.s2 += right2.s2 * w2.s2;

397

398

ARM_DOT(left0.s3, middle0.s3, right0.s3, left1.s3, w0.s0, w0.s1, w0.s2, w1.s0, values0.s3);

399

ARM_DOT(middle1.s3, right1.s3, left2.s3, middle2.s3, w1.s1, w1.s2, w2.s0, w2.s1, values0.s3);

400

values0.s3 += right2.s3 * w2.s2;

401

402

ARM_DOT(left0.s4, middle0.s4, right0.s4, left1.s4, w0.s0, w0.s1, w0.s2, w1.s0, values0.s4);

403

ARM_DOT(middle1.s4, right1.s4, left2.s4, middle2.s4, w1.s1, w1.s2, w2.s0, w2.s1, values0.s4);

404

values0.s4 += right2.s4 * w2.s2;

405

406

ARM_DOT(left0.s5, middle0.s5, right0.s5, left1.s5, w0.s0, w0.s1, w0.s2, w1.s0, values0.s5);

407

ARM_DOT(middle1.s5, right1.s5, left2.s5, middle2.s5, w1.s1, w1.s2, w2.s0, w2.s1, values0.s5);

408

values0.s5 += right2.s5 * w2.s2;

409

410

ARM_DOT(left0.s6, middle0.s6, right0.s6, left1.s6, w0.s0, w0.s1, w0.s2, w1.s0, values0.s6);

411

ARM_DOT(middle1.s6, right1.s6, left2.s6, middle2.s6, w1.s1, w1.s2, w2.s0, w2.s1, values0.s6);

412

values0.s6 += right2.s6 * w2.s2;

413

414

ARM_DOT(left0.s7, middle0.s7, right0.s7, left1.s7, w0.s0, w0.s1, w0.s2, w1.s0, values0.s7);

415

ARM_DOT(middle1.s7, right1.s7, left2.s7, middle2.s7, w1.s1, w1.s2, w2.s0, w2.s1, values0.s7);

416

values0.s7 += right2.s7 * w2.s2;

417

418

#if CONV_STRIDE_Y == 1

419

ARM_DOT(left1.s0, middle1.s0, right1.s0, left2.s0, w0.s0, w0.s1, w0.s2, w1.s0, values1.s0);

420

ARM_DOT(middle2.s0, right2.s0, left3.s0, middle3.s0, w1.s1, w1.s2, w2.s0, w2.s1, values1.s0);

421

values1.s0 += right3.s0 * w2.s2;

422

423

ARM_DOT(left1.s1, middle1.s1, right1.s1, left2.s1, w0.s0, w0.s1, w0.s2, w1.s0, values1.s1);

424

ARM_DOT(middle2.s1, right2.s1, left3.s1, middle3.s1, w1.s1, w1.s2, w2.s0, w2.s1, values1.s1);

425

values1.s1 += right3.s1 * w2.s2;

426

427

ARM_DOT(left1.s2, middle1.s2, right1.s2, left2.s2, w0.s0, w0.s1, w0.s2, w1.s0, values1.s2);

428

ARM_DOT(middle2.s2, right2.s2, left3.s2, middle3.s2, w1.s1, w1.s2, w2.s0, w2.s1, values1.s2);

429

values1.s2 += right3.s2 * w2.s2;

430

431

ARM_DOT(left1.s3, middle1.s3, right1.s3, left2.s3, w0.s0, w0.s1, w0.s2, w1.s0, values1.s3);

432

ARM_DOT(middle2.s3, right2.s3, left3.s3, middle3.s3, w1.s1, w1.s2, w2.s0, w2.s1, values1.s3);

433

values1.s3 += right3.s3 * w2.s2;

434

435

ARM_DOT(left1.s4, middle1.s4, right1.s4, left2.s4, w0.s0, w0.s1, w0.s2, w1.s0, values1.s4);

436

ARM_DOT(middle2.s4, right2.s4, left3.s4, middle3.s4, w1.s1, w1.s2, w2.s0, w2.s1, values1.s4);

437

values1.s4 += right3.s4 * w2.s2;

438

439

ARM_DOT(left1.s5, middle1.s5, right1.s5, left2.s5, w0.s0, w0.s1, w0.s2, w1.s0, values1.s5);

440

ARM_DOT(middle2.s5, right2.s5, left3.s5, middle3.s5, w1.s1, w1.s2, w2.s0, w2.s1, values1.s5);

441

values1.s5 += right3.s5 * w2.s2;

442

443

ARM_DOT(left1.s6, middle1.s6, right1.s6, left2.s6, w0.s0, w0.s1, w0.s2, w1.s0, values1.s6);

444

ARM_DOT(middle2.s6, right2.s6, left3.s6, middle3.s6, w1.s1, w1.s2, w2.s0, w2.s1, values1.s6);

445

values1.s6 += right3.s6 * w2.s2;

446

447

ARM_DOT(left1.s7, middle1.s7, right1.s7, left2.s7, w0.s0, w0.s1, w0.s2, w1.s0, values1.s7);

448

ARM_DOT(middle2.s7, right2.s7, left3.s7, middle3.s7, w1.s1, w1.s2, w2.s0, w2.s1, values1.s7);

449

values1.s7 += right3.s7 * w2.s2;

450

#endif // CONV_STRIDE_Y == 1

451

452

#if defined(HAS_BIAS)

453

values0 += (int8)(bias_value);

454

#if CONV_STRIDE_Y == 1

455

values1 += (int8)(bias_value);

456

#endif /* CONV_STRIDE_Y == 1 */

457

#endif //defined(HAS_BIAS)

458

459

#if WEIGHTS_OFFSET != 0

460

values0 += sum0 * (int8)(WEIGHTS_OFFSET);

461

#if CONV_STRIDE_Y == 1

462

values1 += sum1 * (int8)(WEIGHTS_OFFSET);

463

#endif /* CONV_STRIDE_Y == 1 */

464

#endif /* WEIGHTS_OFFSET != 0 */

465

466

#if INPUT_OFFSET != 0

467

ushort sum_weights = 0;

468

ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);

469

sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;

470

values0 += sum_weights * (int8)(INPUT_OFFSET);

471

#if CONV_STRIDE_Y == 1

472

values1 += sum_weights * (int8)(INPUT_OFFSET);

473

#endif /* CONV_STRIDE_Y == 1 */

474

#endif /* INPUT_OFFSET != 0 */

475

476

#if K_OFFSET != 0

477

values0 += (int8)(K_OFFSET);

478

#if CONV_STRIDE_Y == 1

479

values1 += (int8)(K_OFFSET);

480

#endif /* CONV_STRIDE_Y == 1 */

481

#endif /* K_OFFSET != 0 */

482

483

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

484

values0 += (int8)OUTPUT_OFFSET;

485

uchar8 res0 = convert_uchar8_sat(values0);

486

res0 = max(res0, (uchar8)0);

487

res0 = min(res0, (uchar8)255);

488

489

vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);

490

#if CONV_STRIDE_Y == 1

491

492

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

493

values1 += (int8)OUTPUT_OFFSET;

494

uchar8 res1 = convert_uchar8_sat(values1);

495

res1 = max(res1, (uchar8)0);

496

res1 = min(res1, (uchar8)255);

497

498

vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);

499

#endif /* CONV_STRIDE_Y == 1 */

500

}

501

502

#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED

503

504

#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) */

505

506

#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

507

508

#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)

509

510

#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)

511

#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)

512

#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)

513

514

#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)

515

516

#if WEIGHTS_OFFSET != 0

517

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \

518

({ \

519

sum += CONVERT(x, VEC_INT); \

520

MULTIPLY_ADD(x, y, acc); \

521

})

522

#else /* WEIGHTS_OFFSET != 0 */

523

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)

524

#endif /* WEIGHTS_OFFSET != 0 */

525

526

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

527

#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \

528

({ \

529

ARM_DOT(val0.s0, val1.s0, val2.s0, val3.s0, w0.s0, w1.s0, w2.s0, w3.s0, acc.s0); \

530

ARM_DOT(val4.s0, val5.s0, val6.s0, val7.s0, w4.s0, w5.s0, w6.s0, w7.s0, acc.s0); \

531

acc.s0 += val8.s0 * w8.s0; \

532

\

533

ARM_DOT(val0.s1, val1.s1, val2.s1, val3.s1, w0.s1, w1.s1, w2.s1, w3.s1, acc.s1); \

534

ARM_DOT(val4.s1, val5.s1, val6.s1, val7.s1, w4.s1, w5.s1, w6.s1, w7.s1, acc.s1); \

535

acc.s1 += val8.s1 * w8.s1; \

536

\

537

ARM_DOT(val0.s2, val1.s2, val2.s2, val3.s2, w0.s2, w1.s2, w2.s2, w3.s2, acc.s2); \

538

ARM_DOT(val4.s2, val5.s2, val6.s2, val7.s2, w4.s2, w5.s2, w6.s2, w7.s2, acc.s2); \

539

acc.s2 += val8.s2 * w8.s2; \

540

\

541

ARM_DOT(val0.s3, val1.s3, val2.s3, val3.s3, w0.s3, w1.s3, w2.s3, w3.s3, acc.s3); \

542

ARM_DOT(val4.s3, val5.s3, val6.s3, val7.s3, w4.s3, w5.s3, w6.s3, w7.s3, acc.s3); \

543

acc.s3 += val8.s3 * w8.s3; \

544

})

545

546

#if WEIGHTS_OFFSET != 0

547

#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \

548

({ \

549

sum += CONVERT(val0, VEC_INT) + CONVERT(val1, VEC_INT) + CONVERT(val2, VEC_INT) + CONVERT(val3, VEC_INT) + CONVERT(val4, VEC_INT) + CONVERT(val5, VEC_INT) + CONVERT(val6, VEC_INT) + CONVERT(val7, VEC_INT) + CONVERT(val8, VEC_INT); \

550

DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8); \

551

})

552

#else /* WEIGHTS_OFFSET != 0 */

553

#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)

554

#endif /* WEIGHTS_OFFSET != 0 */

555

556

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

557

558

#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

559

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.

560

*

561

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

562

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

563

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

564

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

565

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)

566

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

567

*

568

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

569

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

570

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

571

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

572

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

573

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

574

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

575

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

576

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

577

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

578

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

579

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

580

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

581

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

582

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

583

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

584

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

585

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

586

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

587

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

588

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

589

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

590

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

591

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

592

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

593

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

594

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

595

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

596

* @param[in] max_offset Max offset for the input tensor

597

*/

598

__kernel void depthwise_convolution_3x3_quantized_nhwc(

599

TENSOR3D_DECLARATION(src),

600

TENSOR3D_DECLARATION(dst),

601

TENSOR3D_DECLARATION(weights),

602

#if defined(HAS_BIAS)

603

VECTOR_DECLARATION(biases),

604

#endif /* defined(HAS_BIAS) */

605

int max_offset)

606

{

607

const int x = get_global_id(0); // channels

608

const int y = get_global_id(1); // spatial coordinate x

609

const int z = get_global_id(2); // spatial coordinate y

610

611

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

612

613

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

614

615

int z_coord = 0;

616

int4 offset = 0;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

617

const int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

618

619

// We compute 2x1x1 [C,W,H] elements

620

VEC_INT acc = 0, sum = 0;

621

622

// Load weights

623

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

624

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

625

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

626

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

627

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

628

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

629

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

630

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

631

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

632

633

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

634

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

635

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

636

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

637

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

642

// z_coord is casted to unsigned int in order to use just a min() operation

643

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

644

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

645

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

646

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

647

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

648

649

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

650

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

651

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

652

653

// z == 1

654

// z_coord can be only negative for z = 0 so we do not need to clamp it

655

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

656

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

657

offset = y_offset + (int4)(z_coord * src_stride_z);

658

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

659

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

660

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

661

662

// z == 2

663

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

664

// However offset can be out-of-bound so we need to check if it is greater than max_offset

665

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

666

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

667

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

668

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

669

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

670

671

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);

672

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);

673

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);

674

675

MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);

676

MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);

677

MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);

678

679

MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);

680

MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);

681

MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);

682

683

#if defined(HAS_BIAS)

684

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

685

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

686

acc += bias_values;

687

#endif // defined(HAS_BIAS)

688

689

#if WEIGHTS_OFFSET != 0

690

acc += WEIGHTS_OFFSET * sum;

691

#endif /* WEIGHTS_OFFSET != 0 */

692

693

#if INPUT_OFFSET != 0

694

acc += INPUT_OFFSET * sum_we;

695

#endif /* INPUT_OFFSET != 0 */

696

697

#if K_OFFSET != 0

698

acc += (VEC_INT)K_OFFSET;

699

#endif /* K_OFFSET != 0 */

700

701

acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

702

acc += (VEC_INT)OUTPUT_OFFSET;

703

704

VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);

705

res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);

706

707

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

VSTORE(VEC_SIZE)

(res, 0, dst.ptr);

}

#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

712

713

#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

714

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1

715

*

716

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

717

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

718

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

719

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

720

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

721

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

722

*

723

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

724

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

725

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

726

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

727

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

728

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

729

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

730

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

731

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

732

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

733

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

734

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

735

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

736

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

737

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

738

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

739

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

740

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

741

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

742

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

743

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

744

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

745

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

746

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

747

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

748

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

749

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

750

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

751

* @param[in] max_offset Max offset for the input tensor

752

*/

753

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

754

__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(

755

TENSOR3D_DECLARATION(src),

756

TENSOR3D_DECLARATION(dst),

757

TENSOR3D_DECLARATION(weights),

758

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

759

VECTOR_DECLARATION(biases),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

760

#endif /* defined(HAS_BIAS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

761

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

762

{

Giorgio Arena

fa23f11

2018-06-19 11:27:38 +0100

[diff] [blame]

763

int x = get_global_id(0);

764

int y = get_global_id(1);

765

int z = get_global_id(2);

766

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

767

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

768

769

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

770

771

int z_coord = 0;

772

int4 offset = 0;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

773

int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

774

775

// We compute 2x2x2 [C,W,H] elements

776

VEC_INT acc0 = 0, sum0 = 0;

777

VEC_INT acc1 = 0, sum1 = 0;

778

VEC_INT acc2 = 0, sum2 = 0;

779

VEC_INT acc3 = 0, sum3 = 0;

780

781

// Load weights

782

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

783

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

784

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

785

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

786

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

787

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

788

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

789

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

790

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

791

792

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

793

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

794

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

795

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

796

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

801

// z_coord is casted to unsigned int in order to use just a min() operation

802

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

803

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

804

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

805

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

806

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

807

808

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

809

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

810

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

811

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

812

813

// z == 1

814

// z_coord can be only negative for z = 0 so we do not need to clamp it

815

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

816

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

817

offset = y_offset + (int4)(z_coord * src_stride_z);

818

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

819

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

820

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

821

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

822

823

// z == 2

824

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

825

// However offset can be out-of-bound so we need to check if it is greater than max_offset

826

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

827

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

828

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

829

VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

830

VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

831

VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

832

833

// z == 3

834

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

835

// However offset can be out-of-bound so we need to check if it is greater than max_offset

836

offset += (int4)(src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

837

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

838

VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

839

VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

840

VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

841

VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

842

843

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);

844

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);

845

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);

846

MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);

847

MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);

848

MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);

849

850

MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);

851

MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);

852

MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);

853

MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);

854

MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);

855

MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);

856

857

MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);

858

MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);

859

MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);

860

MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);

861

MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);

862

MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);

863

864

MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);

865

MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);

866

MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);

867

MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);

868

MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);

869

MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);

870

871

MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);

872

MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);

873

MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);

874

MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);

875

MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);

876

MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);

877

878

MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);

879

MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);

880

MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);

881

MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);

882

MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);

883

MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);

884

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

885

#if defined(HAS_BIAS)

886

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

887

888

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

889

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

acc0 += bias_values;

acc1 += bias_values;

acc2 += bias_values;

acc3 += bias_values;

#endif /* defined(HAS_BIAS) */

895

896

#if WEIGHTS_OFFSET != 0

897

acc0 += WEIGHTS_OFFSET * sum0;

898

acc1 += WEIGHTS_OFFSET * sum1;

899

acc2 += WEIGHTS_OFFSET * sum2;

900

acc3 += WEIGHTS_OFFSET * sum3;

901

#endif /* WEIGHTS_OFFSET != 0 */

902

903

#if INPUT_OFFSET != 0

904

VEC_INT offs = INPUT_OFFSET * sum_we;

acc0 += offs;

acc1 += offs;

acc2 += offs;

acc3 += offs;

#endif /* INPUT_OFFSET != 0 */

911

912

#if K_OFFSET != 0

913

acc0 += (VEC_INT)K_OFFSET;

914

acc1 += (VEC_INT)K_OFFSET;

915

acc2 += (VEC_INT)K_OFFSET;

916

acc3 += (VEC_INT)K_OFFSET;

917

#endif /* K_OFFSET != 0 */

918

919

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

920

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

921

acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

922

acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

923

924

acc0 += (VEC_INT)OUTPUT_OFFSET;

925

acc1 += (VEC_INT)OUTPUT_OFFSET;

926

acc2 += (VEC_INT)OUTPUT_OFFSET;

927

acc3 += (VEC_INT)OUTPUT_OFFSET;

928

929

VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);

930

VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);

931

VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);

932

VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);

933

934

res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);

935

res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);

936

res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);

937

res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);

938

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

939

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;

940

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

941

VSTORE(VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

942

(res0, 0, dst_addr + 0 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

943

VSTORE(VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

944

(res1, 0, dst_addr + 1 * dst_stride_y);

945

946

#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

947

if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)

948

#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

949

{

950

VSTORE(VEC_SIZE)

951

(res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);

952

VSTORE(VEC_SIZE)

953

(res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);

954

}

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

955

}

956

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

957

#if ARM_COMPUTE_OPENCL_DOT8_ENABLED

958

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product

959

*

960

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

961

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

962

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

963

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

964

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

965

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

966

*

967

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

968

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

969

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

970

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

971

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

972

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

973

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

974

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

975

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

976

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

977

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

978

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

979

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

980

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

981

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

982

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

983

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

984

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

985

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

986

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

987

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

988

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

989

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

990

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

991

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

992

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

993

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

994

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

995

*/

996

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

997

__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

998

TENSOR3D_DECLARATION(src),

999

TENSOR3D_DECLARATION(dst),

1000

TENSOR3D_DECLARATION(weights),

1001

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1002

VECTOR_DECLARATION(biases),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1003

#endif /* defined(HAS_BIAS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1004

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1005

{

Giorgio Arena

fa23f11

2018-06-19 11:27:38 +0100

[diff] [blame]

1006

int x = get_global_id(0);

1007

int y = get_global_id(1);

1008

int z = get_global_id(2);

1009

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1010

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1011

1012

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

1013

1014

int z_coord = 0;

1015

int4 offset = 0;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1016

int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1017

1018

// We compute 2x2x2 [C,W,H] elements

1019

VEC_INT acc0 = 0, sum0 = 0;

1020

VEC_INT acc1 = 0, sum1 = 0;

1021

VEC_INT acc2 = 0, sum2 = 0;

1022

VEC_INT acc3 = 0, sum3 = 0;

1023

1024

// Load weights

1025

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

1026

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

1027

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

1028

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

1029

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

1030

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

1031

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

1032

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

1033

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

1034

1035

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1036

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

1037

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

1038

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1039

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1044

// z_coord is casted to unsigned int in order to use just a min() operation

1045

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1046

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1047

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1048

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1049

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1050

1051

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1052

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1053

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1054

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1055

1056

// z == 1

1057

// z_coord can be only negative for z = 0 so we do not need to clamp it

1058

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1059

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1060

offset = y_offset + (int4)(z_coord * src_stride_z);

1061

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1062

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1063

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1064

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1065

1066

// z == 2

1067

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1068

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1069

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1070

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1071

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1072

VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1073

VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1074

VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1075

1076

// z == 3

1077

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1078

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1079

offset += (int4)(src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1080

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1081

VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1082

VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1083

VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1084

VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1085

1086

DOT_PRODUCT_ACCUMULATE(acc0, sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1087

DOT_PRODUCT_ACCUMULATE(acc1, sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1088

DOT_PRODUCT_ACCUMULATE(acc2, sum2, values4, values5, values6, values8, values9, values10, values12, values13, values14, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1089

DOT_PRODUCT_ACCUMULATE(acc3, sum3, values5, values6, values7, values9, values10, values11, values13, values14, values15, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1090

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1091

#if defined(HAS_BIAS)

1092

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1093

1094

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1095

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1096

acc0 += bias_values;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1097

acc1 += bias_values;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1098

acc2 += bias_values;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1099

acc3 += bias_values;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1100

#endif /* defined(HAS_BIAS) */

1101

1102

#if WEIGHTS_OFFSET != 0

1103

acc0 += WEIGHTS_OFFSET * sum0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1104

acc1 += WEIGHTS_OFFSET * sum1;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1105

acc2 += WEIGHTS_OFFSET * sum2;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1106

acc3 += WEIGHTS_OFFSET * sum3;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1107

#endif /* WEIGHTS_OFFSET != 0 */

1108

1109

#if INPUT_OFFSET != 0

1110

VEC_INT offs = INPUT_OFFSET * sum_we;

1111

1112

acc0 += offs;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1113

acc1 += offs;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1114

acc2 += offs;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1115

acc3 += offs;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1116

#endif /* INPUT_OFFSET != 0 */

1117

1118

#if K_OFFSET != 0

1119

acc0 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1120

acc1 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1121

acc2 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1122

acc3 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1123

#endif /* K_OFFSET != 0 */

1124

1125

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1126

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1127

acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1128

acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1129

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1130

acc0 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1131

acc1 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1132

acc2 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1133

acc3 += (VEC_INT)OUTPUT_OFFSET;

1134

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1135

VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1136

VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1137

VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1138

VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);

1139

1140

res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);

1141

res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);

1142

res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);

1143

res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);

1144

1145

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1146

1147

VSTORE(VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1148

(res0, 0, dst_addr + 0 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1149

VSTORE(VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1150

(res1, 0, dst_addr + 1 * dst_stride_y);

1151

1152

#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1153

if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)

1154

#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1155

{

1156

VSTORE(VEC_SIZE)

1157

(res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);

1158

VSTORE(VEC_SIZE)

1159

(res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);

1160

}

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1161

}

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1162

#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1163

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1164

#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1165

Giorgio Arena