Blame - src/core/CL/cl_kernels/depthwise_convolution_quantized.cl - ml/ComputeLibrary

2018-02-16 11:01:04 +0000

[diff] [blame]

28

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

29

#if defined(FUSED_ACTIVATION)

30

#define DATA_TYPE uchar

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

31

#ifndef VEC_SIZE

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

32

#define VEC_SIZE 8

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

33

#endif /* VEC_SIZE */

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

34

#include "activation_layer_qa8.cl"

35

#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)

36

#else /* defined(FUSED_ACTIVATION) */

37

#define ACTIVATION_FUNC(x) (x)

38

#endif /* defined(FUSED_ACTIVATION) */

39

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

40

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

41

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

42

#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

43

#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

44

#define ARM_DOT(x, y, val) val += arm_dot((x), (y));

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

45

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

46

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

47

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

48

#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

49

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

50

#if CONV_STRIDE_X > 3

51

#error "Stride X not supported"

52

#endif /* CONV_STRIDE_X > 3 */

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

53

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

54

#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

55

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

56

#if CONV_STRIDE_X == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

57

#define GET_VALUES(first_value, left, middle, right) \

58

({ \

59

int8 temp0 = CONVERT(vload8(0, first_value), int8); \

60

int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \

61

\

62

left = CONVERT(temp0.s01234567, int8); \

63

middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \

64

right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \

65

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

66

#elif CONV_STRIDE_X == 2

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

67

#define GET_VALUES(first_value, left, middle, right) \

68

({ \

69

int16 temp0 = CONVERT(vload16(0, first_value), int16); \

70

int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \

71

\

72

left = CONVERT(temp0.s02468ace, int8); \

73

middle = CONVERT(temp0.s13579bdf, int8); \

74

right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \

75

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

76

#else /* CONV_STRIDE_X */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

77

#define GET_VALUES(first_value, left, middle, right) \

78

({ \

79

int16 temp0 = CONVERT(vload16(0, first_value), int16); \

80

int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \

81

\

82

left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \

83

middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \

84

right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \

85

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

86

#endif /* CONV_STRIDE_X */

87

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

88

/** This function computes the depthwise convolution quantized.

Anthony Barbier

2017-11-23 18:02:04 +0000

[diff] [blame]

89

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

90

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

91

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Anthony Barbier

2017-11-23 18:02:04 +0000

[diff] [blame]

92

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

93

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Anthony Barbier

2017-11-23 18:02:04 +0000

[diff] [blame]

94

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-11-23 18:02:04 +0000

[diff] [blame]

95

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

96

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

97

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Anthony Barbier

2017-11-23 18:02:04 +0000

[diff] [blame]

98

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

99

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

100

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

101

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

102

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

103

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

104

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

105

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

106

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

107

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

108

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

109

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

110

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

111

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

112

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

113

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

114

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

115

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

116

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

117

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

Anthony Barbier

2017-11-23 18:02:04 +0000

[diff] [blame]

118

*/

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

119

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

120

__kernel void depthwise_convolution_3x3_quantized_nchw(

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

121

TENSOR3D_DECLARATION(src),

122

TENSOR3D_DECLARATION(dst),

Georgios Pinitas

5b2191e

2018-02-22 12:56:51 +0000

[diff] [blame]

123

TENSOR3D_DECLARATION(weights)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

124

#if defined(HAS_BIAS)

Georgios Pinitas

5b2191e

2018-02-22 12:56:51 +0000

[diff] [blame]

125

,

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

126

VECTOR_DECLARATION(biases)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

127

#endif //defined(HAS_BIAS)

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

128

)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

129

{

130

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

131

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

132

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

133

134

// Extract channel and linearized batch indices

135

const int channel = get_global_id(2) % DST_CHANNELS;

136

const int batch = get_global_id(2) / DST_CHANNELS;

137

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

138

#if defined(HAS_BIAS)

139

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

140

Georgios Pinitas

728d3cf

2018-09-21 13:41:35 +0100

[diff] [blame]

141

int bias_value = *((__global int *)(vector_offset(&biases, channel)));

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

142

#endif //defined(HAS_BIAS)

143

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

144

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

145

src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

146

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

Giorgio Arena

7657224

2018-04-04 17:44:26 +0100

[diff] [blame]

147

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

148

uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);

149

uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);

150

uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

151

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

152

int8 values0 = 0;

153

int8 sum0 = 0;

154

#if CONV_STRIDE_Y == 1

155

int8 values1 = 0;

156

int8 sum1 = 0;

157

#endif /* CONV_STRIDE_Y */

158

159

// Row0

160

int8 left, middle, right;

161

GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);

162

values0 += left * (int8)(w0.s0);

163

values0 += middle * (int8)(w0.s1);

164

values0 += right * (int8)(w0.s2);

165

166

#if WEIGHTS_OFFSET != 0

167

sum0 += left + middle + right;

168

#endif /* WEIGHTS_OFFSET != 0 */

169

170

// Row1

171

GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);

172

values0 += left * (int8)(w1.s0);

173

values0 += middle * (int8)(w1.s1);

174

values0 += right * (int8)(w1.s2);

175

#if CONV_STRIDE_Y == 1

176

values1 += left * (int8)(w0.s0);

177

values1 += middle * (int8)(w0.s1);

178

values1 += right * (int8)(w0.s2);

179

#endif /* CONV_STRIDE_Y == 1 */

180

181

#if WEIGHTS_OFFSET != 0

182

int8 tmp = left + middle + right;

183

sum0 += tmp;

184

#if CONV_STRIDE_Y == 1

185

sum1 += tmp;

186

#endif /* CONV_STRIDE_Y == 1 */

187

#endif /* WEIGHTS_OFFSET != 0 */

188

189

// Row2

190

GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);

191

values0 += left * (int8)(w2.s0);

192

values0 += middle * (int8)(w2.s1);

193

values0 += right * (int8)(w2.s2);

194

#if CONV_STRIDE_Y == 1

195

values1 += left * (int8)(w1.s0);

196

values1 += middle * (int8)(w1.s1);

197

values1 += right * (int8)(w1.s2);

198

#endif /* CONV_STRIDE_Y == 1 */

199

200

#if WEIGHTS_OFFSET != 0

201

tmp = left + middle + right;

202

sum0 += tmp;

203

#if CONV_STRIDE_Y == 1

204

sum1 += tmp;

205

#endif /* CONV_STRIDE_Y == 1 */

206

#endif /* WEIGHTS_OFFSET != 0 */

207

208

#if CONV_STRIDE_Y == 1

209

// Row3

210

GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);

211

values1 += left * (int8)(w2.s0);

212

values1 += middle * (int8)(w2.s1);

213

values1 += right * (int8)(w2.s2);

214

215

#if WEIGHTS_OFFSET != 0

216

sum1 += left + middle + right;

217

#endif /* WEIGHTS_OFFSET != 0 */

218

#endif /* CONV_STRIDE_Y == 1 */

219

220

#if defined(HAS_BIAS)

221

values0 += (int8)(bias_value);

222

#if CONV_STRIDE_Y == 1

223

values1 += (int8)(bias_value);

224

#endif /* CONV_STRIDE_Y == 1 */

225

#endif //defined(HAS_BIAS)

226

227

#if WEIGHTS_OFFSET != 0

228

values0 += sum0 * (int8)(WEIGHTS_OFFSET);

229

#if CONV_STRIDE_Y == 1

230

values1 += sum1 * (int8)(WEIGHTS_OFFSET);

231

#endif /* CONV_STRIDE_Y == 1 */

232

#endif /* WEIGHTS_OFFSET != 0 */

233

234

#if INPUT_OFFSET != 0

235

ushort sum_weights = 0;

236

ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);

237

sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;

238

values0 += sum_weights * (int8)(INPUT_OFFSET);

239

#if CONV_STRIDE_Y == 1

240

values1 += sum_weights * (int8)(INPUT_OFFSET);

241

#endif /* CONV_STRIDE_Y == 1 */

242

#endif /* INPUT_OFFSET != 0 */

243

244

#if K_OFFSET != 0

245

values0 += (int8)(K_OFFSET);

246

#if CONV_STRIDE_Y == 1

247

values1 += (int8)(K_OFFSET);

248

#endif /* CONV_STRIDE_Y == 1 */

249

#endif /* K_OFFSET != 0 */

250

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

251

#if defined(REAL_MULTIPLIER)

252

253

values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);

254

255

#else // defined(REAL_MULTIPLIER)

256

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

257

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

258

259

#endif // defined(REAL_MULTIPLIER)

260

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

261

values0 += (int8)OUTPUT_OFFSET;

262

uchar8 res0 = convert_uchar8_sat(values0);

263

res0 = max(res0, (uchar8)0);

264

res0 = min(res0, (uchar8)255);

265

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

266

vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

267

#if CONV_STRIDE_Y == 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

268

#if defined(REAL_MULTIPLIER)

269

270

values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);

271

272

#else // defined(REAL_MULTIPLIER)

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

273

274

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

275

276

#endif // defined(REAL_MULTIPLIER)

277

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

278

values1 += (int8)OUTPUT_OFFSET;

279

uchar8 res1 = convert_uchar8_sat(values1);

280

res1 = max(res1, (uchar8)0);

281

res1 = min(res1, (uchar8)255);

282

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

283

vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

284

#endif /* CONV_STRIDE_Y == 1 */

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

285

}

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

286

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

287

#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

288

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

289

#if CONV_STRIDE_X == 1

290

#define GET_VALUES(first_value, left, middle, right) \

291

({ \

292

uchar8 temp0 = vload8(0, first_value); \

293

uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \

294

\

295

left = temp0.s01234567; \

296

middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \

297

right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

298

})

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

299

#elif CONV_STRIDE_X == 2

300

#define GET_VALUES(first_value, left, middle, right) \

301

({ \

302

uchar16 temp0 = vload16(0, first_value); \

303

uchar temp1 = *(first_value + 16 * sizeof(uchar)); \

304

\

305

left = temp0.s02468ace; \

306

middle = temp0.s13579bdf; \

307

right = (uchar8)(temp0.s2468, temp0.sace, temp1); \

308

})

309

#else /* CONV_STRIDE_X */

310

#define GET_VALUES(first_value, left, middle, right) \

311

({ \

312

uchar16 temp0 = vload16(0, first_value); \

313

uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \

314

\

315

left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \

316

middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \

317

right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \

318

})

319

#endif /* CONV_STRIDE_X */

320

/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

321

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

322

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

323

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

324

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

325

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

326

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

327

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

328

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

329

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

330

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

331

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

332

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

333

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

334

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

335

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

336

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

337

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

338

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

339

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

340

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

341

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

342

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

343

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

344

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

345

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

346

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

347

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

348

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

349

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

350

*/

351

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

352

__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(

353

TENSOR3D_DECLARATION(src),

354

TENSOR3D_DECLARATION(dst),

355

TENSOR3D_DECLARATION(weights)

356

#if defined(HAS_BIAS)

357

,

358

VECTOR_DECLARATION(biases)

359

#endif //defined(HAS_BIAS)

360

)

361

{

362

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

363

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

364

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

365

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

366

// Extract channel and linearized batch indices

367

const int channel = get_global_id(2) % DST_CHANNELS;

368

const int batch = get_global_id(2) / DST_CHANNELS;

369

370

#if defined(HAS_BIAS)

371

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

372

373

const int bias_value = *((__global int *)(vector_offset(&biases, channel)));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

374

#endif //defined(HAS_BIAS)

375

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

376

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

377

src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

378

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

379

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

380

uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);

381

uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);

382

uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

383

384

uchar8 left0, middle0, right0;

385

uchar8 left1, middle1, right1;

386

uchar8 left2, middle2, right2;

int8 values0 = 0;

int8 sum0 = 0;

GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);

392

GET_VALUES(src.ptr + 1 * src_stride_y, left1, middle1, right1);

393

GET_VALUES(src.ptr + 2 * src_stride_y, left2, middle2, right2);

394

395

#if WEIGHTS_OFFSET != 0

396

sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);

397

sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);

398

sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);

399

#endif /* WEIGHTS_OFFSET != 0 */

400

401

#if CONV_STRIDE_Y == 1

402

// If conv_stride_y is equals to 1, we compute two output rows

403

404

uchar8 left3, middle3, right3;

int8 values1 = 0;

int8 sum1 = 0;

GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);

409

410

#if WEIGHTS_OFFSET != 0

411

sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);

412

sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);

413

sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);

414

#endif /* WEIGHTS_OFFSET != 0 */

415

#endif // CONV_STRIDE_Y == 1

416

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

417

ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);

418

ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

419

values0.s0 += right2.s0 * w2.s2;

420

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

421

ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);

422

ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

423

values0.s1 += right2.s1 * w2.s2;

424

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

425

ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);

426

ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

427

values0.s2 += right2.s2 * w2.s2;

428

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

429

ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);

430

ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

431

values0.s3 += right2.s3 * w2.s2;

432

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

433

ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);

434

ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

435

values0.s4 += right2.s4 * w2.s2;

436

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

437

ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);

438

ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

439

values0.s5 += right2.s5 * w2.s2;

440

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

441

ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);

442

ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

443

values0.s6 += right2.s6 * w2.s2;

444

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

445

ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);

446

ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

447

values0.s7 += right2.s7 * w2.s2;

448

449

#if CONV_STRIDE_Y == 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

450

ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);

451

ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

452

values1.s0 += right3.s0 * w2.s2;

453

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

454

ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);

455

ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

456

values1.s1 += right3.s1 * w2.s2;

457

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

458

ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);

459

ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

460

values1.s2 += right3.s2 * w2.s2;

461

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

462

ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);

463

ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

464

values1.s3 += right3.s3 * w2.s2;

465

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

466

ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);

467

ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

468

values1.s4 += right3.s4 * w2.s2;

469

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

470

ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);

471

ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

472

values1.s5 += right3.s5 * w2.s2;

473

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

474

ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);

475

ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

476

values1.s6 += right3.s6 * w2.s2;

477

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

478

ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);

479

ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

480

values1.s7 += right3.s7 * w2.s2;

481

#endif // CONV_STRIDE_Y == 1

482

483

#if defined(HAS_BIAS)

484

values0 += (int8)(bias_value);

485

#if CONV_STRIDE_Y == 1

486

values1 += (int8)(bias_value);

487

#endif /* CONV_STRIDE_Y == 1 */

488

#endif //defined(HAS_BIAS)

489

490

#if WEIGHTS_OFFSET != 0

491

values0 += sum0 * (int8)(WEIGHTS_OFFSET);

492

#if CONV_STRIDE_Y == 1

493

values1 += sum1 * (int8)(WEIGHTS_OFFSET);

494

#endif /* CONV_STRIDE_Y == 1 */

495

#endif /* WEIGHTS_OFFSET != 0 */

496

497

#if INPUT_OFFSET != 0

498

ushort sum_weights = 0;

499

ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);

500

sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;

501

values0 += sum_weights * (int8)(INPUT_OFFSET);

502

#if CONV_STRIDE_Y == 1

503

values1 += sum_weights * (int8)(INPUT_OFFSET);

504

#endif /* CONV_STRIDE_Y == 1 */

505

#endif /* INPUT_OFFSET != 0 */

506

507

#if K_OFFSET != 0

508

values0 += (int8)(K_OFFSET);

509

#if CONV_STRIDE_Y == 1

510

values1 += (int8)(K_OFFSET);

511

#endif /* CONV_STRIDE_Y == 1 */

512

#endif /* K_OFFSET != 0 */

513

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

514

#if defined(REAL_MULTIPLIER)

515

516

values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);

517

518

#else // defined(REAL_MULTIPLIER)

519

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

520

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

521

522

#endif // defined(REAL_MULTIPLIER)

523

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

524

values0 += (int8)OUTPUT_OFFSET;

525

uchar8 res0 = convert_uchar8_sat(values0);

526

res0 = max(res0, (uchar8)0);

527

res0 = min(res0, (uchar8)255);

528

529

vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);

530

#if CONV_STRIDE_Y == 1

531

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

532

#if defined(REAL_MULTIPLIER)

533

534

values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);

535

536

#else // defined(REAL_MULTIPLIER)

537

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

538

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

539

540

#endif // defined(REAL_MULTIPLIER)

541

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

542

values1 += (int8)OUTPUT_OFFSET;

543

uchar8 res1 = convert_uchar8_sat(values1);

544

res1 = max(res1, (uchar8)0);

545

res1 = min(res1, (uchar8)255);

546

547

vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);

548

#endif /* CONV_STRIDE_Y == 1 */

549

}

550

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

551

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

552

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

553

#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

554

555

#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

556

557

#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)

558

559

#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

560

#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

561

#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)

562

#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)

563

564

#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)

565

566

#if WEIGHTS_OFFSET != 0

567

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \

568

({ \

569

sum += CONVERT(x, VEC_INT); \

570

MULTIPLY_ADD(x, y, acc); \

571

})

572

#else /* WEIGHTS_OFFSET != 0 */

573

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)

574

#endif /* WEIGHTS_OFFSET != 0 */

575

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

576

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

577

#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \

578

({ \

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

579

ARM_DOT((uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), (uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), acc.s0); \

580

ARM_DOT((uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), (uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), acc.s0); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

581

acc.s0 += val8.s0 * w8.s0; \

582

\

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

583

ARM_DOT((uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), (uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), acc.s1); \

584

ARM_DOT((uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), (uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), acc.s1); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

585

acc.s1 += val8.s1 * w8.s1; \

586

\

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

587

ARM_DOT((uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), (uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), acc.s2); \

588

ARM_DOT((uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), (uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), acc.s2); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

589

acc.s2 += val8.s2 * w8.s2; \

590

\

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

591

ARM_DOT((uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), (uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), acc.s3); \

592

ARM_DOT((uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), (uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), acc.s3); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

593

acc.s3 += val8.s3 * w8.s3; \

594

})

595

596

#if WEIGHTS_OFFSET != 0

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

597

#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \

598

({ \

599

ARM_DOT((uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), (uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), acc.s0); \

600

ARM_DOT((uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), (uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), acc.s0); \

601

ARM_DOT((uchar4)(w8.s0, 0, 0, 0), (uchar4)val8.s0, acc.s0); \

602

\

603

ARM_DOT((uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), (uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), acc.s1); \

604

ARM_DOT((uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), (uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), acc.s1); \

605

ARM_DOT((uchar4)(w8.s1, 0, 0, 0), (uchar4)val8.s1, acc.s1); \

606

\

607

ARM_DOT((uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), (uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), acc.s2); \

608

ARM_DOT((uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), (uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), acc.s2); \

609

ARM_DOT((uchar4)(w8.s2, 0, 0, 0), (uchar4)val8.s2, acc.s2); \

610

\

611

ARM_DOT((uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), (uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), acc.s3); \

612

ARM_DOT((uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), (uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), acc.s3); \

613

ARM_DOT((uchar4)(w8.s3, 0, 0, 0), (uchar4)val8.s3, acc.s3); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

614

})

615

#else /* WEIGHTS_OFFSET != 0 */

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

616

#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

617

#endif /* WEIGHTS_OFFSET != 0 */

618

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

619

#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \

620

({ \

621

sum = CONVERT(val0, VEC_INT); \

622

ARM_DOT((uchar4)(val1.s0, val2.s0, val3.s0, val4.s0), (uchar4)1, sum.s0); \

623

ARM_DOT((uchar4)(val5.s0, val6.s0, val7.s0, val8.s0), (uchar4)1, sum.s0); \

624

\

625

ARM_DOT((uchar4)(val1.s1, val2.s1, val3.s1, val4.s1), (uchar4)1, sum.s1); \

626

ARM_DOT((uchar4)(val5.s1, val6.s1, val7.s1, val8.s1), (uchar4)1, sum.s1); \

627

\

628

ARM_DOT((uchar4)(val1.s2, val2.s2, val3.s2, val4.s2), (uchar4)1, sum.s2); \

629

ARM_DOT((uchar4)(val5.s2, val6.s2, val7.s2, val8.s2), (uchar4)1, sum.s2); \

630

\

631

ARM_DOT((uchar4)(val1.s3, val2.s3, val3.s3, val4.s3), (uchar4)1, sum.s3); \

632

ARM_DOT((uchar4)(val5.s3, val6.s3, val7.s3, val8.s3), (uchar4)1, sum.s3); \

633

})

634

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

635

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

636

637

#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

638

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.

639

*

640

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

641

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

642

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

643

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

644

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)

645

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

646

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

647

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

648

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

649

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

650

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

651

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

652

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

653

* @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)

654

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

655

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

656

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

657

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

658

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

659

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

660

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

661

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

662

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

663

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

664

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

665

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

666

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

667

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

668

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

669

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

670

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

671

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

672

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

673

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

674

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

675

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

676

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

677

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

678

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

679

* @param[in] max_offset Max offset for the input tensor

680

*/

681

__kernel void depthwise_convolution_3x3_quantized_nhwc(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

682

TENSOR4D_DECLARATION(src),

683

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

684

TENSOR3D_DECLARATION(weights),

685

#if defined(HAS_BIAS)

686

VECTOR_DECLARATION(biases),

687

#endif /* defined(HAS_BIAS) */

688

int max_offset)

689

{

690

const int x = get_global_id(0); // channels

691

const int y = get_global_id(1); // spatial coordinate x

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

692

#if defined(DST_DEPTH)

693

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

694

int b = get_global_id(2) / (int)DST_DEPTH; // batch

695

#else /* defined(DST_DEPTH) */

696

int z = get_global_id(2); // spatial coordinate y

697

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

698

699

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

700

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

701

#if defined(DST_DEPTH)

702

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;

703

#else /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

704

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

705

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

706

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

707

int z_coord = 0;

708

int4 offset = 0;

709

int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

710

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

711

// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1

712

y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);

713

y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);

714

y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);

715

y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

716

717

int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

718

719

// We compute 4x1x1 [C,W,H] elements

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

720

VEC_INT acc = 0, sum = 0;

721

722

// Load weights

723

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

724

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

725

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

726

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

727

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

728

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

729

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

730

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

731

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

732

733

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

734

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

735

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

736

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

737

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

742

// z_coord is casted to unsigned int in order to use just a min() operation

743

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

744

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

745

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

746

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

747

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

748

749

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

750

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

751

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

752

753

// z == 1

754

// z_coord can be only negative for z = 0 so we do not need to clamp it

755

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

756

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

757

offset = y_offset + (int4)(z_coord * src_stride_z);

758

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

759

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

760

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

761

762

// z == 2

763

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

764

// However offset can be out-of-bound so we need to check if it is greater than max_offset

765

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

766

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

767

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

768

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

769

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

770

771

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);

772

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);

773

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);

774

775

MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);

776

MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);

777

MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);

778

779

MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);

780

MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);

781

MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);

782

783

#if defined(HAS_BIAS)

784

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

785

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

786

acc += bias_values;

787

#endif // defined(HAS_BIAS)

788

789

#if WEIGHTS_OFFSET != 0

790

acc += WEIGHTS_OFFSET * sum;

791

#endif /* WEIGHTS_OFFSET != 0 */

792

793

#if INPUT_OFFSET != 0

794

acc += INPUT_OFFSET * sum_we;

795

#endif /* INPUT_OFFSET != 0 */

796

797

#if K_OFFSET != 0

798

acc += (VEC_INT)K_OFFSET;

799

#endif /* K_OFFSET != 0 */

800

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

801

#if defined(REAL_MULTIPLIER)

802

803

acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

804

805

#else // defined(REAL_MULTIPLIER)

806

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

807

acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

808

#endif // defined(REAL_MULTIPLIER)

809

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

810

acc += (VEC_INT)OUTPUT_OFFSET;

811

812

VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);

813

res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);

814

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

815

#if defined(DST_DEPTH)

816

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;

817

#else /* defined(DST_DEPTH) */

818

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;

819

#endif /* defined(DST_DEPTH) */

820

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

821

VSTORE(VEC_SIZE)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

822

(ACTIVATION_FUNC(res), 0, dst_addr);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

823

}

824

#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

825

826

#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

827

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1

828

*

829

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

830

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

831

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

832

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

833

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

834

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

835

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

836

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

837

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

838

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

839

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

840

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

841

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

842

* @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)

843

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

844

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

845

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

846

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

847

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

848

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

849

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

850

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

851

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

852

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

853

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

854

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

855

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

856

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

857

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

858

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

859

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

860

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

861

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

862

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

863

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

864

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

865

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

866

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

867

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

868

* @param[in] max_offset Max offset for the input tensor

869

*/

870

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

871

__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

872

TENSOR4D_DECLARATION(src),

873

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

874

TENSOR3D_DECLARATION(weights),

875

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

876

VECTOR_DECLARATION(biases),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

877

#endif /* defined(HAS_BIAS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

878

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

879

{

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

880

int x = get_global_id(0);

881

int y = get_global_id(1);

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

882

#if defined(DST_DEPTH)

883

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

884

int b = get_global_id(2) / (int)DST_DEPTH; // batch

885

#else /* defined(DST_DEPTH) */

886

int z = get_global_id(2); // spatial coordinate y

887

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

888

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

889

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

890

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

891

#if defined(DST_DEPTH)

892

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;

893

#else /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

894

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

895

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

896

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

897

int z_coord = 0;

898

int4 offset = 0;

899

int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

900

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

901

// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1

902

y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);

903

y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);

904

y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);

905

y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

906

907

int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

908

909

// We compute 4x2x2 [C,W,H] elements

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

910

VEC_INT acc0 = 0, sum0 = 0;

911

VEC_INT acc1 = 0, sum1 = 0;

912

VEC_INT acc2 = 0, sum2 = 0;

913

VEC_INT acc3 = 0, sum3 = 0;

914

915

// Load weights

916

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

917

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

918

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

919

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

920

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

921

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

922

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

923

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

924

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

925

926

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

927

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

928

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

929

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

930

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

935

// z_coord is casted to unsigned int in order to use just a min() operation

936

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

937

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

938

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

939

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

940

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

941

942

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

943

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

944

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

945

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

946

947

// z == 1

948

// z_coord can be only negative for z = 0 so we do not need to clamp it

949

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

950

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

951

offset = y_offset + (int4)(z_coord * src_stride_z);

952

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

953

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

954

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

955

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

956

957

// z == 2

958

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

959

// However offset can be out-of-bound so we need to check if it is greater than max_offset

960

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

961

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

962

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

963

VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

964

VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

965

VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

966

967

// z == 3

968

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

969

// However offset can be out-of-bound so we need to check if it is greater than max_offset

970

offset += (int4)(src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

971

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

972

VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

973

VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

974

VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

975

VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

976

977

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);

978

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);

979

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);

980

MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);

981

MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);

982

MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);

983

984

MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);

985

MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);

986

MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);

987

MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);

988

MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);

989

MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);

990

991

MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);

992

MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);

993

MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);

994

MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);

995

MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);

996

MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);

997

998

MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);

999

MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);

1000

MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);

1001

MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);

1002

MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);

1003

MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);

1004

1005

MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);

1006

MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);

1007

MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);

1008

MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);

1009

MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);

1010

MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);

1011

1012

MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);

1013

MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);

1014

MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);

1015

MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);

1016

MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);

1017

MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);

1018

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1019

#if defined(HAS_BIAS)

1020

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1021

1022

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1023

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

acc0 += bias_values;

acc1 += bias_values;

acc2 += bias_values;

acc3 += bias_values;

#endif /* defined(HAS_BIAS) */

1029

1030

#if WEIGHTS_OFFSET != 0

1031

acc0 += WEIGHTS_OFFSET * sum0;

1032

acc1 += WEIGHTS_OFFSET * sum1;

1033

acc2 += WEIGHTS_OFFSET * sum2;

1034

acc3 += WEIGHTS_OFFSET * sum3;

1035

#endif /* WEIGHTS_OFFSET != 0 */

1036

1037

#if INPUT_OFFSET != 0

1038

VEC_INT offs = INPUT_OFFSET * sum_we;

acc0 += offs;

acc1 += offs;

acc2 += offs;

acc3 += offs;

#endif /* INPUT_OFFSET != 0 */

1045

1046

#if K_OFFSET != 0

1047

acc0 += (VEC_INT)K_OFFSET;

1048

acc1 += (VEC_INT)K_OFFSET;

1049

acc2 += (VEC_INT)K_OFFSET;

1050

acc3 += (VEC_INT)K_OFFSET;

1051

#endif /* K_OFFSET != 0 */

1052

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1053

#if defined(REAL_MULTIPLIER)

1054

1055

acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1056

acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1057

acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1058

acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1059

1060

#else // defined(REAL_MULTIPLIER)

1061

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1062

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1063

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1064

acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1065

acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1066

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1067

#endif // defined(REAL_MULTIPLIER)

1068

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1069

acc0 += (VEC_INT)OUTPUT_OFFSET;

1070

acc1 += (VEC_INT)OUTPUT_OFFSET;

1071

acc2 += (VEC_INT)OUTPUT_OFFSET;

1072

acc3 += (VEC_INT)OUTPUT_OFFSET;

1073

1074

VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);

1075

VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);

1076

VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);

1077

VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);

1078

1079

res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);

1080

res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);

1081

res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);

1082

res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);

1083

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1084

#if defined(DST_DEPTH)

1085

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;

1086

#else /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1087

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1088

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1089

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1090

VSTORE(VEC_SIZE)

Georgios Pinitas

2018-10-22 16:17:20 +0100

[diff] [blame]

1091

(ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1092

VSTORE(VEC_SIZE)

Georgios Pinitas

2018-10-22 16:17:20 +0100

[diff] [blame]

1093

(ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1094

1095

#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1096

if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)

1097

#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1098

{

1099

VSTORE(VEC_SIZE)

Georgios Pinitas

2018-10-22 16:17:20 +0100

[diff] [blame]

1100

(ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1101

VSTORE(VEC_SIZE)

Georgios Pinitas

2018-10-22 16:17:20 +0100

[diff] [blame]

1102

(ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1103

}

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1104

}

1105

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

1106

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1107

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product

1108

*

1109

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

1110

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

1111

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

1112

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

1113

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1114

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1115

* @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.

1116

* If not, the quantization will be performed using a fixed point multiplication

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1117

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1118

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

1119

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1120

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1121

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1122

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1123

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1124

* @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)

1125

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1126

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1127

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1128

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

1129

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1130

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1131

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1132

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1133

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1134

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1135

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1136

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1137

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1138

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

1139

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1140

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1141

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1142

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1143

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1144

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

1145

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1146

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

1147

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1148

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1149

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1150

* @param[in] max_offset The maximum allowed offset for the input tensor

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1151

*/

1152

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1153

__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1154

TENSOR4D_DECLARATION(src),

1155

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1156

TENSOR3D_DECLARATION(weights),

1157

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1158

VECTOR_DECLARATION(biases),

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1159

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1160

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1161

{

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1162

int x = get_global_id(0);

1163

int y = get_global_id(1);

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1164

#if defined(DST_DEPTH)

1165

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1166

int b = get_global_id(2) / (int)DST_DEPTH; // batch

1167

#else /* defined(DST_DEPTH) */

1168

int z = get_global_id(2); // spatial coordinate y

1169

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1170

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1171

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1172

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1173

#if defined(DST_DEPTH)

1174

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;

1175

#else /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1176

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1177

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1178

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1179

int z_coord = 0;

1180

int4 offset = 0;

1181

int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1182

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1183

// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1

1184

y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);

1185

y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);

1186

y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);

1187

y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

1188

1189

int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

1190

1191

// We compute 4x2x1 [C,W,H] elements

VEC_INT acc0 = 0;

VEC_INT acc1 = 0;

VEC_INT sum0 = 0;

VEC_INT sum1 = 0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1196

1197

// Load weights

1198

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

1199

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

1200

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

1201

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

1202

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

1203

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

1204

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

1205

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

1206

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

1207

1208

#if INPUT_OFFSET != 0

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1209

// Initilize the final result with the weights reduction multiplied by INPUT_OFFSET

1210

DOT_PRODUCT_REDUCTION(acc0, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1211

1212

// Multiply the weights reduction with INPUT_OFFSET

1213

acc0 = INPUT_OFFSET * acc0;

1214

1215

acc1 = acc0;

1216

#endif // INPUT_OFFSET != 0

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1221

// z_coord is casted to unsigned int in order to use just a min() operation

1222

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1223

z_coord = z - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1224

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1225

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1226

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1227

1228

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1229

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1230

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1231

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1232

1233

// z == 1

1234

// z_coord can be only negative for z = 0 so we do not need to clamp it

1235

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1236

z_coord = z - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1237

offset = y_offset + (int4)(z_coord * src_stride_z);

1238

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1239

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1240

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1241

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1242

1243

// z == 2

1244

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1245

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1246

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1247

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1248

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1249

VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1250

VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1251

VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1252

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1253

DOT_PRODUCT_REDUCTION(sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10);

1254

DOT_PRODUCT_ACCUMULATE(acc0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1255

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1256

DOT_PRODUCT_REDUCTION(sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11);

1257

DOT_PRODUCT_ACCUMULATE(acc1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1258

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1259

#if defined(HAS_BIAS)

1260

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1261

1262

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1263

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1264

acc0 += bias_values;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1265

acc1 += bias_values;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1266

1267

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1268

1269

#if WEIGHTS_OFFSET != 0

1270

acc0 += WEIGHTS_OFFSET * sum0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1271

acc1 += WEIGHTS_OFFSET * sum1;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1272

#endif // WEIGHTS_OFFSET != 0

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1273

1274

#if K_OFFSET != 0

1275

acc0 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1276

acc1 += (VEC_INT)K_OFFSET;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1277

1278

#endif // K_OFFSET != 0

1279

1280

#if defined(REAL_MULTIPLIER)

1281

1282

acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1283

acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1284

1285

#else // defined(REAL_MULTIPLIER)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1286

1287

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1288

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1289

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1290

#endif // defined(REAL_MULTIPLIER)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1291

acc0 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1292

acc1 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1293

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1294

VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1295

VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1296

1297

res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);

1298

res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1299

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1300

#if defined(DST_DEPTH)

1301

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;

1302

#else /* defined(DST_DEPTH) */

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1303

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame^]

1304

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1305

1306

VSTORE(VEC_SIZE)

Georgios Pinitas

2018-10-22 16:17:20 +0100

[diff] [blame]

1307

(ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1308

VSTORE(VEC_SIZE)

Georgios Pinitas

2018-10-22 16:17:20 +0100

[diff] [blame]

1309

(ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1310

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1311

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

1312

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1313

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1314

#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1315

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1316

#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

1317

Gian Marco Iodice