Blame - src/core/CL/cl_kernels/depthwise_convolution_quantized.cl - ml/ComputeLibrary

2018-02-16 11:01:04 +0000

[diff] [blame]

28

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

29

#if defined(FUSED_ACTIVATION)

30

#define DATA_TYPE uchar

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

31

#ifndef VEC_SIZE

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

32

#define VEC_SIZE 8

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

33

#endif /* VEC_SIZE */

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

34

#include "activation_layer_qa8.cl"

35

#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)

36

#else /* defined(FUSED_ACTIVATION) */

37

#define ACTIVATION_FUNC(x) (x)

38

#endif /* defined(FUSED_ACTIVATION) */

39

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

40

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

41

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

42

#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

43

#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

44

#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

45

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

46

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

47

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

48

#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

49

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

50

#if CONV_STRIDE_X > 3

51

#error "Stride X not supported"

52

#endif /* CONV_STRIDE_X > 3 */

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

53

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

54

#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

55

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

56

#if CONV_STRIDE_X == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

57

#define GET_VALUES(first_value, left, middle, right) \

58

({ \

59

int8 temp0 = CONVERT(vload8(0, first_value), int8); \

60

int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \

61

\

62

left = CONVERT(temp0.s01234567, int8); \

63

middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \

64

right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \

65

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

66

#elif CONV_STRIDE_X == 2

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

67

#define GET_VALUES(first_value, left, middle, right) \

68

({ \

69

int16 temp0 = CONVERT(vload16(0, first_value), int16); \

70

int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \

71

\

72

left = CONVERT(temp0.s02468ace, int8); \

73

middle = CONVERT(temp0.s13579bdf, int8); \

74

right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \

75

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

76

#else /* CONV_STRIDE_X */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

77

#define GET_VALUES(first_value, left, middle, right) \

78

({ \

79

int16 temp0 = CONVERT(vload16(0, first_value), int16); \

80

int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \

81

\

82

left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \

83

middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \

84

right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \

85

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

86

#endif /* CONV_STRIDE_X */

87

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

88

/** This function computes the depthwise convolution quantized.

Anthony Barbier

f202e50

2017-11-23 18:02:04 +0000

[diff] [blame]

89

*

90

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

91

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

92

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

93

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

94

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

95

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

96

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

97

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

98

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

99

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

100

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

101

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

102

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

103

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

104

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

105

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

106

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

107

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

108

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

109

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

110

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

111

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

112

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

113

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

114

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

115

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

116

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

117

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

Anthony Barbier

f202e50

2017-11-23 18:02:04 +0000

[diff] [blame]

118

*/

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

119

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

120

__kernel void depthwise_convolution_3x3_quantized_nchw(

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

121

TENSOR3D_DECLARATION(src),

122

TENSOR3D_DECLARATION(dst),

Georgios Pinitas

5b2191e

2018-02-22 12:56:51 +0000

[diff] [blame]

123

TENSOR3D_DECLARATION(weights)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

124

#if defined(HAS_BIAS)

Georgios Pinitas

5b2191e

2018-02-22 12:56:51 +0000

[diff] [blame]

125

,

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

126

VECTOR_DECLARATION(biases)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

127

#endif //defined(HAS_BIAS)

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

128

)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

129

{

130

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

131

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

132

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

133

134

// Extract channel and linearized batch indices

135

const int channel = get_global_id(2) % DST_CHANNELS;

136

const int batch = get_global_id(2) / DST_CHANNELS;

137

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

138

#if defined(HAS_BIAS)

139

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

140

Georgios Pinitas

728d3cf

2018-09-21 13:41:35 +0100

[diff] [blame^]

141

int bias_value = *((__global int *)(vector_offset(&biases, channel)));

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

142

#endif //defined(HAS_BIAS)

143

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

144

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

145

src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

146

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

Giorgio Arena

7657224

2018-04-04 17:44:26 +0100

[diff] [blame]

147

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

148

uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);

149

uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);

150

uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

151

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

152

int8 values0 = 0;

153

int8 sum0 = 0;

154

#if CONV_STRIDE_Y == 1

155

int8 values1 = 0;

156

int8 sum1 = 0;

157

#endif /* CONV_STRIDE_Y */

158

159

// Row0

160

int8 left, middle, right;

161

GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);

162

values0 += left * (int8)(w0.s0);

163

values0 += middle * (int8)(w0.s1);

164

values0 += right * (int8)(w0.s2);

165

166

#if WEIGHTS_OFFSET != 0

167

sum0 += left + middle + right;

168

#endif /* WEIGHTS_OFFSET != 0 */

169

170

// Row1

171

GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);

172

values0 += left * (int8)(w1.s0);

173

values0 += middle * (int8)(w1.s1);

174

values0 += right * (int8)(w1.s2);

175

#if CONV_STRIDE_Y == 1

176

values1 += left * (int8)(w0.s0);

177

values1 += middle * (int8)(w0.s1);

178

values1 += right * (int8)(w0.s2);

179

#endif /* CONV_STRIDE_Y == 1 */

180

181

#if WEIGHTS_OFFSET != 0

182

int8 tmp = left + middle + right;

183

sum0 += tmp;

184

#if CONV_STRIDE_Y == 1

185

sum1 += tmp;

186

#endif /* CONV_STRIDE_Y == 1 */

187

#endif /* WEIGHTS_OFFSET != 0 */

188

189

// Row2

190

GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);

191

values0 += left * (int8)(w2.s0);

192

values0 += middle * (int8)(w2.s1);

193

values0 += right * (int8)(w2.s2);

194

#if CONV_STRIDE_Y == 1

195

values1 += left * (int8)(w1.s0);

196

values1 += middle * (int8)(w1.s1);

197

values1 += right * (int8)(w1.s2);

198

#endif /* CONV_STRIDE_Y == 1 */

199

200

#if WEIGHTS_OFFSET != 0

201

tmp = left + middle + right;

202

sum0 += tmp;

203

#if CONV_STRIDE_Y == 1

204

sum1 += tmp;

205

#endif /* CONV_STRIDE_Y == 1 */

206

#endif /* WEIGHTS_OFFSET != 0 */

207

208

#if CONV_STRIDE_Y == 1

209

// Row3

210

GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);

211

values1 += left * (int8)(w2.s0);

212

values1 += middle * (int8)(w2.s1);

213

values1 += right * (int8)(w2.s2);

214

215

#if WEIGHTS_OFFSET != 0

216

sum1 += left + middle + right;

217

#endif /* WEIGHTS_OFFSET != 0 */

218

#endif /* CONV_STRIDE_Y == 1 */

219

220

#if defined(HAS_BIAS)

221

values0 += (int8)(bias_value);

222

#if CONV_STRIDE_Y == 1

223

values1 += (int8)(bias_value);

224

#endif /* CONV_STRIDE_Y == 1 */

225

#endif //defined(HAS_BIAS)

226

227

#if WEIGHTS_OFFSET != 0

228

values0 += sum0 * (int8)(WEIGHTS_OFFSET);

229

#if CONV_STRIDE_Y == 1

230

values1 += sum1 * (int8)(WEIGHTS_OFFSET);

231

#endif /* CONV_STRIDE_Y == 1 */

232

#endif /* WEIGHTS_OFFSET != 0 */

233

234

#if INPUT_OFFSET != 0

235

ushort sum_weights = 0;

236

ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);

237

sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;

238

values0 += sum_weights * (int8)(INPUT_OFFSET);

239

#if CONV_STRIDE_Y == 1

240

values1 += sum_weights * (int8)(INPUT_OFFSET);

241

#endif /* CONV_STRIDE_Y == 1 */

242

#endif /* INPUT_OFFSET != 0 */

243

244

#if K_OFFSET != 0

245

values0 += (int8)(K_OFFSET);

246

#if CONV_STRIDE_Y == 1

247

values1 += (int8)(K_OFFSET);

248

#endif /* CONV_STRIDE_Y == 1 */

249

#endif /* K_OFFSET != 0 */

250

251

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

252

values0 += (int8)OUTPUT_OFFSET;

253

uchar8 res0 = convert_uchar8_sat(values0);

254

res0 = max(res0, (uchar8)0);

255

res0 = min(res0, (uchar8)255);

256

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

257

vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

258

#if CONV_STRIDE_Y == 1

259

260

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

261

values1 += (int8)OUTPUT_OFFSET;

262

uchar8 res1 = convert_uchar8_sat(values1);

263

res1 = max(res1, (uchar8)0);

264

res1 = min(res1, (uchar8)255);

265

Giorgio Arena

2018-02-16 15:17:23 +0000

[diff] [blame]

266

vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

267

#endif /* CONV_STRIDE_Y == 1 */

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

268

}

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

269

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

270

#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

271

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

272

#if CONV_STRIDE_X == 1

273

#define GET_VALUES(first_value, left, middle, right) \

274

({ \

275

uchar8 temp0 = vload8(0, first_value); \

276

uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \

277

\

278

left = temp0.s01234567; \

279

middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \

280

right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

281

})

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

282

#elif CONV_STRIDE_X == 2

283

#define GET_VALUES(first_value, left, middle, right) \

284

({ \

285

uchar16 temp0 = vload16(0, first_value); \

286

uchar temp1 = *(first_value + 16 * sizeof(uchar)); \

287

\

288

left = temp0.s02468ace; \

289

middle = temp0.s13579bdf; \

290

right = (uchar8)(temp0.s2468, temp0.sace, temp1); \

291

})

292

#else /* CONV_STRIDE_X */

293

#define GET_VALUES(first_value, left, middle, right) \

294

({ \

295

uchar16 temp0 = vload16(0, first_value); \

296

uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \

297

\

298

left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \

299

middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \

300

right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \

301

})

302

#endif /* CONV_STRIDE_X */

303

/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

304

*

305

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

306

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

307

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

308

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

309

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

310

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

311

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

312

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

313

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

314

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

315

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

316

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

317

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

318

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

319

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

320

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

321

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

322

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

323

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

324

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

325

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

326

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

327

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

328

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

329

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

330

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

331

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

332

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

333

*/

334

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

335

__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(

336

TENSOR3D_DECLARATION(src),

337

TENSOR3D_DECLARATION(dst),

338

TENSOR3D_DECLARATION(weights)

339

#if defined(HAS_BIAS)

340

,

341

VECTOR_DECLARATION(biases)

342

#endif //defined(HAS_BIAS)

343

)

344

{

345

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

346

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

347

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

348

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

349

// Extract channel and linearized batch indices

350

const int channel = get_global_id(2) % DST_CHANNELS;

351

const int batch = get_global_id(2) / DST_CHANNELS;

352

353

#if defined(HAS_BIAS)

354

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

355

356

const int bias_value = *((__global int *)(vector_offset(&biases, channel)));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

357

#endif //defined(HAS_BIAS)

358

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

359

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

360

src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

361

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

362

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

363

uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);

364

uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);

365

uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

366

367

uchar8 left0, middle0, right0;

368

uchar8 left1, middle1, right1;

369

uchar8 left2, middle2, right2;

int8 values0 = 0;

int8 sum0 = 0;

GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);

375

GET_VALUES(src.ptr + 1 * src_stride_y, left1, middle1, right1);

376

GET_VALUES(src.ptr + 2 * src_stride_y, left2, middle2, right2);

377

378

#if WEIGHTS_OFFSET != 0

379

sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);

380

sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);

381

sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);

382

#endif /* WEIGHTS_OFFSET != 0 */

383

384

#if CONV_STRIDE_Y == 1

385

// If conv_stride_y is equals to 1, we compute two output rows

386

387

uchar8 left3, middle3, right3;

int8 values1 = 0;

int8 sum1 = 0;

GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);

392

393

#if WEIGHTS_OFFSET != 0

394

sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);

395

sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);

396

sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);

397

#endif /* WEIGHTS_OFFSET != 0 */

398

#endif // CONV_STRIDE_Y == 1

399

400

ARM_DOT(left0.s0, middle0.s0, right0.s0, left1.s0, w0.s0, w0.s1, w0.s2, w1.s0, values0.s0);

401

ARM_DOT(middle1.s0, right1.s0, left2.s0, middle2.s0, w1.s1, w1.s2, w2.s0, w2.s1, values0.s0);

402

values0.s0 += right2.s0 * w2.s2;

403

404

ARM_DOT(left0.s1, middle0.s1, right0.s1, left1.s1, w0.s0, w0.s1, w0.s2, w1.s0, values0.s1);

405

ARM_DOT(middle1.s1, right1.s1, left2.s1, middle2.s1, w1.s1, w1.s2, w2.s0, w2.s1, values0.s1);

406

values0.s1 += right2.s1 * w2.s2;

407

408

ARM_DOT(left0.s2, middle0.s2, right0.s2, left1.s2, w0.s0, w0.s1, w0.s2, w1.s0, values0.s2);

409

ARM_DOT(middle1.s2, right1.s2, left2.s2, middle2.s2, w1.s1, w1.s2, w2.s0, w2.s1, values0.s2);

410

values0.s2 += right2.s2 * w2.s2;

411

412

ARM_DOT(left0.s3, middle0.s3, right0.s3, left1.s3, w0.s0, w0.s1, w0.s2, w1.s0, values0.s3);

413

ARM_DOT(middle1.s3, right1.s3, left2.s3, middle2.s3, w1.s1, w1.s2, w2.s0, w2.s1, values0.s3);

414

values0.s3 += right2.s3 * w2.s2;

415

416

ARM_DOT(left0.s4, middle0.s4, right0.s4, left1.s4, w0.s0, w0.s1, w0.s2, w1.s0, values0.s4);

417

ARM_DOT(middle1.s4, right1.s4, left2.s4, middle2.s4, w1.s1, w1.s2, w2.s0, w2.s1, values0.s4);

418

values0.s4 += right2.s4 * w2.s2;

419

420

ARM_DOT(left0.s5, middle0.s5, right0.s5, left1.s5, w0.s0, w0.s1, w0.s2, w1.s0, values0.s5);

421

ARM_DOT(middle1.s5, right1.s5, left2.s5, middle2.s5, w1.s1, w1.s2, w2.s0, w2.s1, values0.s5);

422

values0.s5 += right2.s5 * w2.s2;

423

424

ARM_DOT(left0.s6, middle0.s6, right0.s6, left1.s6, w0.s0, w0.s1, w0.s2, w1.s0, values0.s6);

425

ARM_DOT(middle1.s6, right1.s6, left2.s6, middle2.s6, w1.s1, w1.s2, w2.s0, w2.s1, values0.s6);

426

values0.s6 += right2.s6 * w2.s2;

427

428

ARM_DOT(left0.s7, middle0.s7, right0.s7, left1.s7, w0.s0, w0.s1, w0.s2, w1.s0, values0.s7);

429

ARM_DOT(middle1.s7, right1.s7, left2.s7, middle2.s7, w1.s1, w1.s2, w2.s0, w2.s1, values0.s7);

430

values0.s7 += right2.s7 * w2.s2;

431

432

#if CONV_STRIDE_Y == 1

433

ARM_DOT(left1.s0, middle1.s0, right1.s0, left2.s0, w0.s0, w0.s1, w0.s2, w1.s0, values1.s0);

434

ARM_DOT(middle2.s0, right2.s0, left3.s0, middle3.s0, w1.s1, w1.s2, w2.s0, w2.s1, values1.s0);

435

values1.s0 += right3.s0 * w2.s2;

436

437

ARM_DOT(left1.s1, middle1.s1, right1.s1, left2.s1, w0.s0, w0.s1, w0.s2, w1.s0, values1.s1);

438

ARM_DOT(middle2.s1, right2.s1, left3.s1, middle3.s1, w1.s1, w1.s2, w2.s0, w2.s1, values1.s1);

439

values1.s1 += right3.s1 * w2.s2;

440

441

ARM_DOT(left1.s2, middle1.s2, right1.s2, left2.s2, w0.s0, w0.s1, w0.s2, w1.s0, values1.s2);

442

ARM_DOT(middle2.s2, right2.s2, left3.s2, middle3.s2, w1.s1, w1.s2, w2.s0, w2.s1, values1.s2);

443

values1.s2 += right3.s2 * w2.s2;

444

445

ARM_DOT(left1.s3, middle1.s3, right1.s3, left2.s3, w0.s0, w0.s1, w0.s2, w1.s0, values1.s3);

446

ARM_DOT(middle2.s3, right2.s3, left3.s3, middle3.s3, w1.s1, w1.s2, w2.s0, w2.s1, values1.s3);

447

values1.s3 += right3.s3 * w2.s2;

448

449

ARM_DOT(left1.s4, middle1.s4, right1.s4, left2.s4, w0.s0, w0.s1, w0.s2, w1.s0, values1.s4);

450

ARM_DOT(middle2.s4, right2.s4, left3.s4, middle3.s4, w1.s1, w1.s2, w2.s0, w2.s1, values1.s4);

451

values1.s4 += right3.s4 * w2.s2;

452

453

ARM_DOT(left1.s5, middle1.s5, right1.s5, left2.s5, w0.s0, w0.s1, w0.s2, w1.s0, values1.s5);

454

ARM_DOT(middle2.s5, right2.s5, left3.s5, middle3.s5, w1.s1, w1.s2, w2.s0, w2.s1, values1.s5);

455

values1.s5 += right3.s5 * w2.s2;

456

457

ARM_DOT(left1.s6, middle1.s6, right1.s6, left2.s6, w0.s0, w0.s1, w0.s2, w1.s0, values1.s6);

458

ARM_DOT(middle2.s6, right2.s6, left3.s6, middle3.s6, w1.s1, w1.s2, w2.s0, w2.s1, values1.s6);

459

values1.s6 += right3.s6 * w2.s2;

460

461

ARM_DOT(left1.s7, middle1.s7, right1.s7, left2.s7, w0.s0, w0.s1, w0.s2, w1.s0, values1.s7);

462

ARM_DOT(middle2.s7, right2.s7, left3.s7, middle3.s7, w1.s1, w1.s2, w2.s0, w2.s1, values1.s7);

463

values1.s7 += right3.s7 * w2.s2;

464

#endif // CONV_STRIDE_Y == 1

465

466

#if defined(HAS_BIAS)

467

values0 += (int8)(bias_value);

468

#if CONV_STRIDE_Y == 1

469

values1 += (int8)(bias_value);

470

#endif /* CONV_STRIDE_Y == 1 */

471

#endif //defined(HAS_BIAS)

472

473

#if WEIGHTS_OFFSET != 0

474

values0 += sum0 * (int8)(WEIGHTS_OFFSET);

475

#if CONV_STRIDE_Y == 1

476

values1 += sum1 * (int8)(WEIGHTS_OFFSET);

477

#endif /* CONV_STRIDE_Y == 1 */

478

#endif /* WEIGHTS_OFFSET != 0 */

479

480

#if INPUT_OFFSET != 0

481

ushort sum_weights = 0;

482

ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);

483

sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;

484

values0 += sum_weights * (int8)(INPUT_OFFSET);

485

#if CONV_STRIDE_Y == 1

486

values1 += sum_weights * (int8)(INPUT_OFFSET);

487

#endif /* CONV_STRIDE_Y == 1 */

488

#endif /* INPUT_OFFSET != 0 */

489

490

#if K_OFFSET != 0

491

values0 += (int8)(K_OFFSET);

492

#if CONV_STRIDE_Y == 1

493

values1 += (int8)(K_OFFSET);

494

#endif /* CONV_STRIDE_Y == 1 */

495

#endif /* K_OFFSET != 0 */

496

497

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

498

values0 += (int8)OUTPUT_OFFSET;

499

uchar8 res0 = convert_uchar8_sat(values0);

500

res0 = max(res0, (uchar8)0);

501

res0 = min(res0, (uchar8)255);

502

503

vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);

504

#if CONV_STRIDE_Y == 1

505

506

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

507

values1 += (int8)OUTPUT_OFFSET;

508

uchar8 res1 = convert_uchar8_sat(values1);

509

res1 = max(res1, (uchar8)0);

510

res1 = min(res1, (uchar8)255);

511

512

vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);

513

#endif /* CONV_STRIDE_Y == 1 */

514

}

515

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

516

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

517

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

518

#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

519

520

#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

521

522

#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)

523

524

#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)

525

#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)

526

#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)

527

528

#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)

529

530

#if WEIGHTS_OFFSET != 0

531

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \

532

({ \

533

sum += CONVERT(x, VEC_INT); \

534

MULTIPLY_ADD(x, y, acc); \

535

})

536

#else /* WEIGHTS_OFFSET != 0 */

537

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)

538

#endif /* WEIGHTS_OFFSET != 0 */

539

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

540

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

541

#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \

542

({ \

543

ARM_DOT(val0.s0, val1.s0, val2.s0, val3.s0, w0.s0, w1.s0, w2.s0, w3.s0, acc.s0); \

544

ARM_DOT(val4.s0, val5.s0, val6.s0, val7.s0, w4.s0, w5.s0, w6.s0, w7.s0, acc.s0); \

545

acc.s0 += val8.s0 * w8.s0; \

546

\

547

ARM_DOT(val0.s1, val1.s1, val2.s1, val3.s1, w0.s1, w1.s1, w2.s1, w3.s1, acc.s1); \

548

ARM_DOT(val4.s1, val5.s1, val6.s1, val7.s1, w4.s1, w5.s1, w6.s1, w7.s1, acc.s1); \

549

acc.s1 += val8.s1 * w8.s1; \

550

\

551

ARM_DOT(val0.s2, val1.s2, val2.s2, val3.s2, w0.s2, w1.s2, w2.s2, w3.s2, acc.s2); \

552

ARM_DOT(val4.s2, val5.s2, val6.s2, val7.s2, w4.s2, w5.s2, w6.s2, w7.s2, acc.s2); \

553

acc.s2 += val8.s2 * w8.s2; \

554

\

555

ARM_DOT(val0.s3, val1.s3, val2.s3, val3.s3, w0.s3, w1.s3, w2.s3, w3.s3, acc.s3); \

556

ARM_DOT(val4.s3, val5.s3, val6.s3, val7.s3, w4.s3, w5.s3, w6.s3, w7.s3, acc.s3); \

557

acc.s3 += val8.s3 * w8.s3; \

558

})

559

560

#if WEIGHTS_OFFSET != 0

561

#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \

562

({ \

563

sum += CONVERT(val0, VEC_INT) + CONVERT(val1, VEC_INT) + CONVERT(val2, VEC_INT) + CONVERT(val3, VEC_INT) + CONVERT(val4, VEC_INT) + CONVERT(val5, VEC_INT) + CONVERT(val6, VEC_INT) + CONVERT(val7, VEC_INT) + CONVERT(val8, VEC_INT); \

564

DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8); \

565

})

566

#else /* WEIGHTS_OFFSET != 0 */

567

#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)

568

#endif /* WEIGHTS_OFFSET != 0 */

569

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

570

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

571

572

#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

573

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.

574

*

575

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

576

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

577

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

578

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

579

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)

580

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

581

*

582

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

583

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

584

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

585

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

586

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

587

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

588

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

589

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

590

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

591

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

592

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

593

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

594

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

595

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

596

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

597

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

598

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

599

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

600

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

601

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

602

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

603

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

604

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

605

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

606

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

607

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

608

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

609

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

610

* @param[in] max_offset Max offset for the input tensor

611

*/

612

__kernel void depthwise_convolution_3x3_quantized_nhwc(

613

TENSOR3D_DECLARATION(src),

614

TENSOR3D_DECLARATION(dst),

615

TENSOR3D_DECLARATION(weights),

616

#if defined(HAS_BIAS)

617

VECTOR_DECLARATION(biases),

618

#endif /* defined(HAS_BIAS) */

619

int max_offset)

620

{

621

const int x = get_global_id(0); // channels

622

const int y = get_global_id(1); // spatial coordinate x

623

const int z = get_global_id(2); // spatial coordinate y

624

625

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

626

627

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

628

629

int z_coord = 0;

630

int4 offset = 0;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

631

const int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

632

633

// We compute 2x1x1 [C,W,H] elements

634

VEC_INT acc = 0, sum = 0;

635

636

// Load weights

637

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

638

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

639

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

640

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

641

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

642

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

643

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

644

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

645

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

646

647

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

648

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

649

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

650

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

651

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

656

// z_coord is casted to unsigned int in order to use just a min() operation

657

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

658

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

659

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

660

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

661

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

662

663

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

664

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

665

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

666

667

// z == 1

668

// z_coord can be only negative for z = 0 so we do not need to clamp it

669

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

670

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

671

offset = y_offset + (int4)(z_coord * src_stride_z);

672

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

673

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

674

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

675

676

// z == 2

677

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

678

// However offset can be out-of-bound so we need to check if it is greater than max_offset

679

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

680

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

681

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

682

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

683

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

684

685

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);

686

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);

687

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);

688

689

MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);

690

MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);

691

MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);

692

693

MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);

694

MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);

695

MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);

696

697

#if defined(HAS_BIAS)

698

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

699

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

700

acc += bias_values;

701

#endif // defined(HAS_BIAS)

702

703

#if WEIGHTS_OFFSET != 0

704

acc += WEIGHTS_OFFSET * sum;

705

#endif /* WEIGHTS_OFFSET != 0 */

706

707

#if INPUT_OFFSET != 0

708

acc += INPUT_OFFSET * sum_we;

709

#endif /* INPUT_OFFSET != 0 */

710

711

#if K_OFFSET != 0

712

acc += (VEC_INT)K_OFFSET;

713

#endif /* K_OFFSET != 0 */

714

715

acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

716

acc += (VEC_INT)OUTPUT_OFFSET;

717

718

VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);

719

res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);

720

721

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

VSTORE(VEC_SIZE)

(res, 0, dst.ptr);

}

#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

726

727

#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

728

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1

729

*

730

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

731

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

732

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

733

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

734

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

735

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

736

*

737

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

738

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

739

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

740

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

741

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

742

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

743

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

744

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

745

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

746

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

747

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

748

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

749

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

750

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

751

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

752

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

753

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

754

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

755

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

756

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

757

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

758

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

759

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

760

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

761

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

762

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

763

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

764

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

765

* @param[in] max_offset Max offset for the input tensor

766

*/

767

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

768

__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(

769

TENSOR3D_DECLARATION(src),

770

TENSOR3D_DECLARATION(dst),

771

TENSOR3D_DECLARATION(weights),

772

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

773

VECTOR_DECLARATION(biases),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

774

#endif /* defined(HAS_BIAS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

775

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

776

{

Giorgio Arena

fa23f11

2018-06-19 11:27:38 +0100

[diff] [blame]

777

int x = get_global_id(0);

778

int y = get_global_id(1);

779

int z = get_global_id(2);

780

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

781

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

782

783

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

784

785

int z_coord = 0;

786

int4 offset = 0;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

787

int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

788

789

// We compute 2x2x2 [C,W,H] elements

790

VEC_INT acc0 = 0, sum0 = 0;

791

VEC_INT acc1 = 0, sum1 = 0;

792

VEC_INT acc2 = 0, sum2 = 0;

793

VEC_INT acc3 = 0, sum3 = 0;

794

795

// Load weights

796

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

797

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

798

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

799

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

800

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

801

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

802

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

803

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

804

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

805

806

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

807

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

808

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

809

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

810

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

815

// z_coord is casted to unsigned int in order to use just a min() operation

816

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

817

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

818

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

819

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

820

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

821

822

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

823

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

824

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

825

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

826

827

// z == 1

828

// z_coord can be only negative for z = 0 so we do not need to clamp it

829

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

830

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

831

offset = y_offset + (int4)(z_coord * src_stride_z);

832

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

833

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

834

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

835

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

836

837

// z == 2

838

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

839

// However offset can be out-of-bound so we need to check if it is greater than max_offset

840

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

841

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

842

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

843

VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

844

VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

845

VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

846

847

// z == 3

848

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

849

// However offset can be out-of-bound so we need to check if it is greater than max_offset

850

offset += (int4)(src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

851

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

852

VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

853

VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

854

VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

855

VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

856

857

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);

858

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);

859

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);

860

MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);

861

MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);

862

MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);

863

864

MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);

865

MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);

866

MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);

867

MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);

868

MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);

869

MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);

870

871

MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);

872

MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);

873

MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);

874

MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);

875

MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);

876

MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);

877

878

MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);

879

MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);

880

MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);

881

MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);

882

MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);

883

MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);

884

885

MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);

886

MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);

887

MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);

888

MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);

889

MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);

890

MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);

891

892

MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);

893

MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);

894

MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);

895

MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);

896

MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);

897

MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);

898

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

899

#if defined(HAS_BIAS)

900

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

901

902

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

903

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

acc0 += bias_values;

acc1 += bias_values;

acc2 += bias_values;

acc3 += bias_values;

#endif /* defined(HAS_BIAS) */

909

910

#if WEIGHTS_OFFSET != 0

911

acc0 += WEIGHTS_OFFSET * sum0;

912

acc1 += WEIGHTS_OFFSET * sum1;

913

acc2 += WEIGHTS_OFFSET * sum2;

914

acc3 += WEIGHTS_OFFSET * sum3;

915

#endif /* WEIGHTS_OFFSET != 0 */

916

917

#if INPUT_OFFSET != 0

918

VEC_INT offs = INPUT_OFFSET * sum_we;

acc0 += offs;

acc1 += offs;

acc2 += offs;

acc3 += offs;

#endif /* INPUT_OFFSET != 0 */

925

926

#if K_OFFSET != 0

927

acc0 += (VEC_INT)K_OFFSET;

928

acc1 += (VEC_INT)K_OFFSET;

929

acc2 += (VEC_INT)K_OFFSET;

930

acc3 += (VEC_INT)K_OFFSET;

931

#endif /* K_OFFSET != 0 */

932

933

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

934

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

935

acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

936

acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

937

938

acc0 += (VEC_INT)OUTPUT_OFFSET;

939

acc1 += (VEC_INT)OUTPUT_OFFSET;

940

acc2 += (VEC_INT)OUTPUT_OFFSET;

941

acc3 += (VEC_INT)OUTPUT_OFFSET;

942

943

VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);

944

VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);

945

VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);

946

VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);

947

948

res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);

949

res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);

950

res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);

951

res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);

952

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

953

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;

954

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

955

VSTORE(VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

956

(res0, 0, dst_addr + 0 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

957

VSTORE(VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

958

(res1, 0, dst_addr + 1 * dst_stride_y);

959

960

#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

961

if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)

962

#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

963

{

964

VSTORE(VEC_SIZE)

965

(res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);

966

VSTORE(VEC_SIZE)

967

(res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);

968

}

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

969

}

970

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

971

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

972

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product

973

*

974

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

975

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

976

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

977

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

978

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

979

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

980

*

981

* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8

982

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

983

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

984

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

985

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

986

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

987

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

988

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

989

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8

990

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

991

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

992

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

993

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

994

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

995

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

996

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

997

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8

998

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

999

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1000

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1001

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1002

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1003

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

1004

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1005

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8

1006

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1007

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1008

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1009

*/

1010

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1011

__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1012

TENSOR3D_DECLARATION(src),

1013

TENSOR3D_DECLARATION(dst),

1014

TENSOR3D_DECLARATION(weights),

1015

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1016

VECTOR_DECLARATION(biases),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1017

#endif /* defined(HAS_BIAS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1018

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1019

{

Giorgio Arena

fa23f11

2018-06-19 11:27:38 +0100

[diff] [blame]

1020

int x = get_global_id(0);

1021

int y = get_global_id(1);

1022

int z = get_global_id(2);

1023

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1024

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1025

1026

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

1027

1028

int z_coord = 0;

1029

int4 offset = 0;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1030

int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1031

1032

// We compute 2x2x2 [C,W,H] elements

1033

VEC_INT acc0 = 0, sum0 = 0;

1034

VEC_INT acc1 = 0, sum1 = 0;

1035

VEC_INT acc2 = 0, sum2 = 0;

1036

VEC_INT acc3 = 0, sum3 = 0;

1037

1038

// Load weights

1039

VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);

1040

VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);

1041

VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);

1042

VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);

1043

VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);

1044

VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);

1045

VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);

1046

VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);

1047

VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);

1048

1049

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1050

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

1051

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

1052

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1053

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1058

// z_coord is casted to unsigned int in order to use just a min() operation

1059

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1060

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1061

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1062

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1063

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1064

1065

VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1066

VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1067

VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1068

VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1069

1070

// z == 1

1071

// z_coord can be only negative for z = 0 so we do not need to clamp it

1072

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1073

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1074

offset = y_offset + (int4)(z_coord * src_stride_z);

1075

VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1076

VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1077

VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1078

VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1079

1080

// z == 2

1081

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1082

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1083

offset += (int4)src_stride_z;

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1084

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1085

VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1086

VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1087

VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1088

VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1089

1090

// z == 3

1091

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1092

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1093

offset += (int4)(src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1094

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1095

VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);

1096

VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);

1097

VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);

1098

VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);

1099

1100

DOT_PRODUCT_ACCUMULATE(acc0, sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1101

DOT_PRODUCT_ACCUMULATE(acc1, sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1102

DOT_PRODUCT_ACCUMULATE(acc2, sum2, values4, values5, values6, values8, values9, values10, values12, values13, values14, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1103

DOT_PRODUCT_ACCUMULATE(acc3, sum3, values5, values6, values7, values9, values10, values11, values13, values14, values15, w0, w1, w2, w3, w4, w5, w6, w7, w8);

1104

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1105

#if defined(HAS_BIAS)

1106

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1107

1108

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1109

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1110

acc0 += bias_values;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1111

acc1 += bias_values;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1112

acc2 += bias_values;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1113

acc3 += bias_values;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1114

#endif /* defined(HAS_BIAS) */

1115

1116

#if WEIGHTS_OFFSET != 0

1117

acc0 += WEIGHTS_OFFSET * sum0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1118

acc1 += WEIGHTS_OFFSET * sum1;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1119

acc2 += WEIGHTS_OFFSET * sum2;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1120

acc3 += WEIGHTS_OFFSET * sum3;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1121

#endif /* WEIGHTS_OFFSET != 0 */

1122

1123

#if INPUT_OFFSET != 0

1124

VEC_INT offs = INPUT_OFFSET * sum_we;

1125

1126

acc0 += offs;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1127

acc1 += offs;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1128

acc2 += offs;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1129

acc3 += offs;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1130

#endif /* INPUT_OFFSET != 0 */

1131

1132

#if K_OFFSET != 0

1133

acc0 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1134

acc1 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1135

acc2 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1136

acc3 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1137

#endif /* K_OFFSET != 0 */

1138

1139

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1140

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1141

acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1142

acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1143

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1144

acc0 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1145

acc1 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1146

acc2 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1147

acc3 += (VEC_INT)OUTPUT_OFFSET;

1148

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1149

VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1150

VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1151

VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1152

VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);

1153

1154

res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);

1155

res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);

1156

res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);

1157

res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);

1158

1159

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1160

1161

VSTORE(VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1162

(res0, 0, dst_addr + 0 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1163

VSTORE(VEC_SIZE)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1164

(res1, 0, dst_addr + 1 * dst_stride_y);

1165

1166

#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1167

if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)

1168

#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1169

{

1170

VSTORE(VEC_SIZE)

1171

(res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);

1172

VSTORE(VEC_SIZE)

1173

(res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);

1174

}

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1175

}

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

1176

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1177

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1178

#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1179

Giorgio Arena