/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "helpers_asymm.h"

#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)

#if defined(FUSED_ACTIVATION)
#define DATA_TYPE uchar
#ifndef VEC_SIZE
#define VEC_SIZE 8
#endif /* VEC_SIZE */
#include "activation_layer_qa8.cl"
#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
#else /* defined(FUSED_ACTIVATION) */
#define ACTIVATION_FUNC(x) (x)
#endif /* defined(FUSED_ACTIVATION) */

#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X)

#if CONV_STRIDE_X > 3
#error "Stride X not supported"
#endif /* CONV_STRIDE_X > 3 */

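// GET_VALUES loads one row of the input and de-interleaves it, according to CONV_STRIDE_X, into
// the three int8 vectors needed to compute 8 output pixels at once: "left", "middle" and "right"
// hold the samples that are multiplied by the first, second and third tap of a filter row.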
#if CONV_STRIDE_X == 1
#define GET_VALUES(first_value, left, middle, right) \
    ({ \
        int8 temp0 = CONVERT(vload8(0, first_value), int8); \
        int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \
        \
        left   = CONVERT(temp0.s01234567, int8); \
        middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
        right  = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
    })
#elif CONV_STRIDE_X == 2
#define GET_VALUES(first_value, left, middle, right) \
    ({ \
        int16 temp0 = CONVERT(vload16(0, first_value), int16); \
        int temp1   = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \
        \
        left   = CONVERT(temp0.s02468ace, int8); \
        middle = CONVERT(temp0.s13579bdf, int8); \
        right  = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
    })
#else /* CONV_STRIDE_X */
#define GET_VALUES(first_value, left, middle, right) \
    ({ \
        int16 temp0 = CONVERT(vload16(0, first_value), int16); \
        int8 temp1  = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
        \
        left   = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
        middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
        right  = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
    })
#endif /* CONV_STRIDE_X */

/** This function computes a quantized 3x3 depthwise convolution for the NCHW data layout.
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_quantized_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
#endif //defined(HAS_BIAS)

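    // With a depth multiplier, DEPTH_MULTIPLIER consecutive output channels are computed from the
    // same input channel, so rewind the Z offset implied by the output-driven work-item mapping.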
    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
    uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
    uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);

    int8 values0 = 0;
    int8 sum0 = 0;
#if CONV_STRIDE_Y == 1
    int8 values1 = 0;
    int8 sum1 = 0;
#endif /* CONV_STRIDE_Y */

    // Row0
    int8 left, middle, right;
    GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
    values0 += left * (int8)(w0.s0);
    values0 += middle * (int8)(w0.s1);
    values0 += right * (int8)(w0.s2);

#if WEIGHTS_OFFSET != 0
    sum0 += left + middle + right;
#endif /* WEIGHTS_OFFSET != 0 */

    // Row1
    GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);
    values0 += left * (int8)(w1.s0);
    values0 += middle * (int8)(w1.s1);
    values0 += right * (int8)(w1.s2);
#if CONV_STRIDE_Y == 1
    values1 += left * (int8)(w0.s0);
    values1 += middle * (int8)(w0.s1);
    values1 += right * (int8)(w0.s2);
#endif /* CONV_STRIDE_Y == 1 */

#if WEIGHTS_OFFSET != 0
    int8 tmp = left + middle + right;
    sum0 += tmp;
#if CONV_STRIDE_Y == 1
    sum1 += tmp;
#endif /* CONV_STRIDE_Y == 1 */
#endif /* WEIGHTS_OFFSET != 0 */

    // Row2
    GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);
    values0 += left * (int8)(w2.s0);
    values0 += middle * (int8)(w2.s1);
    values0 += right * (int8)(w2.s2);
#if CONV_STRIDE_Y == 1
    values1 += left * (int8)(w1.s0);
    values1 += middle * (int8)(w1.s1);
    values1 += right * (int8)(w1.s2);
#endif /* CONV_STRIDE_Y == 1 */

#if WEIGHTS_OFFSET != 0
    tmp = left + middle + right;
    sum0 += tmp;
#if CONV_STRIDE_Y == 1
    sum1 += tmp;
#endif /* CONV_STRIDE_Y == 1 */
#endif /* WEIGHTS_OFFSET != 0 */

#if CONV_STRIDE_Y == 1
    // Row3
    GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
    values1 += left * (int8)(w2.s0);
    values1 += middle * (int8)(w2.s1);
    values1 += right * (int8)(w2.s2);

#if WEIGHTS_OFFSET != 0
    sum1 += left + middle + right;
#endif /* WEIGHTS_OFFSET != 0 */
#endif /* CONV_STRIDE_Y == 1 */

#if defined(HAS_BIAS)
    values0 += (int8)(bias_value);
#if CONV_STRIDE_Y == 1
    values1 += (int8)(bias_value);
#endif /* CONV_STRIDE_Y == 1 */
#endif //defined(HAS_BIAS)

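    // Compensation terms of the asymmetric-quantization expansion: WEIGHTS_OFFSET * sum(inputs),
    // INPUT_OFFSET * sum(weights) and the constant K_OFFSET cross term.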
#if WEIGHTS_OFFSET != 0
    values0 += sum0 * (int8)(WEIGHTS_OFFSET);
#if CONV_STRIDE_Y == 1
    values1 += sum1 * (int8)(WEIGHTS_OFFSET);
#endif /* CONV_STRIDE_Y == 1 */
#endif /* WEIGHTS_OFFSET != 0 */

#if INPUT_OFFSET != 0
    ushort sum_weights = 0;
    ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
    sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
    values0 += sum_weights * (int8)(INPUT_OFFSET);
#if CONV_STRIDE_Y == 1
    values1 += sum_weights * (int8)(INPUT_OFFSET);
#endif /* CONV_STRIDE_Y == 1 */
#endif /* INPUT_OFFSET != 0 */

#if K_OFFSET != 0
    values0 += (int8)(K_OFFSET);
#if CONV_STRIDE_Y == 1
    values1 += (int8)(K_OFFSET);
#endif /* CONV_STRIDE_Y == 1 */
#endif /* K_OFFSET != 0 */

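    // Requantize: scale by the fixed-point multiplier OUTPUT_MULTIPLIER >> OUTPUT_SHIFT, re-centre
    // on OUTPUT_OFFSET and saturate to the QASYMM8 range [0, 255].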
    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
    values0 += (int8)OUTPUT_OFFSET;
    uchar8 res0 = convert_uchar8_sat(values0);
    res0 = max(res0, (uchar8)0);
    res0 = min(res0, (uchar8)255);

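    // Apply the fused activation (identity when FUSED_ACTIVATION is not defined) and store 8 pixels.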
    vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
#if CONV_STRIDE_Y == 1

    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
    values1 += (int8)OUTPUT_OFFSET;
    uchar8 res1 = convert_uchar8_sat(values1);
    res1 = max(res1, (uchar8)0);
    res1 = min(res1, (uchar8)255);

    vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
#endif /* CONV_STRIDE_Y == 1 */
}

#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) */

#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)

#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)

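// MULTIPLY_ADD widens the uchar operands to ushort before multiplying, so a single tap product
// cannot overflow, and accumulates in 32-bit. MULTIPLY_ADD_ACCUMULATE additionally tracks sum(x),
// which is only needed when WEIGHTS_OFFSET != 0.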
#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)

#if WEIGHTS_OFFSET != 0
#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
    ({ \
        sum += CONVERT(x, VEC_INT); \
        MULTIPLY_ADD(x, y, acc); \
    })
#else /* WEIGHTS_OFFSET != 0 */
#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
#endif /* WEIGHTS_OFFSET != 0 */

/** This function computes a quantized 3x3 depthwise convolution for the NHWC data layout (stride 1).
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
    VECTOR_DECLARATION(biases)
#endif /* defined(HAS_BIAS) */
)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
#endif /* defined(HAS_BIAS) */

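    // In NHWC the X dimension holds the channels while Y/Z walk the spatial width/height. Build the
    // padded coordinates of the six input rows needed to produce four outputs at unit stride.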
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_step_x;
    int8 y_coord = (int8)(y * (src_step_y / src_stride_y)) + (int8)(0, 1, 2, 3, 4, 5, 0, 0) - CONV_PAD_LEFT;
    int z_coord = z * (src_step_z / src_stride_z) - CONV_PAD_TOP;

    VEC_INT sum_we = 0;
    VEC_INT acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    VEC_INT sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;

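    // The 3x3 filter is traversed as three Z-planes of the weights tensor; each VLOAD fetches
    // VEC_SIZE channels of one filter tap.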
    // z == 0
    VEC_UCHAR w0, w1, w2;
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    int valid_z = z_coord;
    int8 valid_y = select(y_coord, -1, (int8)valid_z < 0);           // If z < 0, set y to -1
    valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_1 (out of bounds)
    valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1);                       // Clamp z coordinate

    VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3);

    weights.ptr += weights_stride_z;

    // z == 1
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    // Only unit pad_top/pad_bottom is allowed, so this plane can never be out of bounds
    valid_z = z_coord + 1;
    valid_y = y_coord;

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3);

    weights.ptr += weights_stride_z;

    // z == 2
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    valid_z = z_coord + 2;
    valid_y = select(y_coord, -1, (int8)valid_z < 0);
    valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2);
    valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3);

#if defined(HAS_BIAS)
    acc0 += bias_values;
    acc1 += bias_values;
    acc2 += bias_values;
    acc3 += bias_values;
#endif /* defined(HAS_BIAS) */

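    // Compensation terms: WEIGHTS_OFFSET * sum(inputs), INPUT_OFFSET * sum(weights) and the
    // constant K_OFFSET cross term.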
#if WEIGHTS_OFFSET != 0
    acc0 += WEIGHTS_OFFSET * sum0;
    acc1 += WEIGHTS_OFFSET * sum1;
    acc2 += WEIGHTS_OFFSET * sum2;
    acc3 += WEIGHTS_OFFSET * sum3;
#endif /* WEIGHTS_OFFSET != 0 */

#if INPUT_OFFSET != 0
    VEC_INT offs = INPUT_OFFSET * sum_we;

    acc0 += offs;
    acc1 += offs;
    acc2 += offs;
    acc3 += offs;
#endif /* INPUT_OFFSET != 0 */

#if K_OFFSET != 0
    acc0 += (VEC_INT)K_OFFSET;
    acc1 += (VEC_INT)K_OFFSET;
    acc2 += (VEC_INT)K_OFFSET;
    acc3 += (VEC_INT)K_OFFSET;
#endif /* K_OFFSET != 0 */

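    // Requantize the four output rows and saturate to the QASYMM8 range [0, 255].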
    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

    acc0 += (VEC_INT)OUTPUT_OFFSET;
    acc1 += (VEC_INT)OUTPUT_OFFSET;
    acc2 += (VEC_INT)OUTPUT_OFFSET;
    acc3 += (VEC_INT)OUTPUT_OFFSET;

    VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
    VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
    VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
    VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);

    res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
    res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
    res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
    res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);

    VSTORE(VEC_SIZE)
    (res0, 0, dst.ptr + 0 * dst_stride_y);
    VSTORE(VEC_SIZE)
    (res1, 0, dst.ptr + 1 * dst_stride_y);
    VSTORE(VEC_SIZE)
    (res2, 0, dst.ptr + 2 * dst_stride_y);
    VSTORE(VEC_SIZE)
    (res3, 0, dst.ptr + 3 * dst_stride_y);
}

/** This function computes a quantized 3x3 depthwise convolution for the NHWC data layout (stride 2).
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_quantized_nhwc_stride2(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
    VECTOR_DECLARATION(biases)
#endif /* defined(HAS_BIAS) */
)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
#endif /* defined(HAS_BIAS) */

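    // Stride 2: only the first five entries of y_coord are used, since the two outputs computed
    // per work-item cover input positions 0..4.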
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_step_x;
    int8 y_coord = (int8)(y * (src_step_y / src_stride_y)) + (int8)(0, 1, 2, 3, 4, 5, 0, 0) - CONV_PAD_LEFT;
    int z_coord = z * (src_step_z / src_stride_z) - CONV_PAD_TOP;

    VEC_INT sum_we = 0;
    VEC_INT acc0 = 0, acc2 = 0;
    VEC_INT sum0 = 0, sum2 = 0;

    // z == 0
    VEC_UCHAR w0, w1, w2;
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    int valid_z = z_coord;
    int8 valid_y = select(y_coord, -1, (int8)valid_z < 0);           // If z < 0, set y to -1
    valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_1 (out of bounds)
    valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1);                       // Clamp z coordinate

    VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);

    weights.ptr += weights_stride_z;

    // z == 1
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    // Only unit pad_top/pad_bottom is allowed, so this plane can never be out of bounds
    valid_z = z_coord + 1;
    valid_y = y_coord;

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);

    weights.ptr += weights_stride_z;

    // z == 2
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    valid_z = z_coord + 2;
    valid_y = select(y_coord, -1, (int8)valid_z < 0);
    valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2);
    valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);

#if defined(HAS_BIAS)
    acc0 += bias_values;
    acc2 += bias_values;
#endif /* defined(HAS_BIAS) */

#if WEIGHTS_OFFSET != 0
    acc0 += WEIGHTS_OFFSET * sum0;
    acc2 += WEIGHTS_OFFSET * sum2;
#endif /* WEIGHTS_OFFSET != 0 */

#if INPUT_OFFSET != 0
    VEC_INT offs = INPUT_OFFSET * sum_we;

    acc0 += offs;
    acc2 += offs;
#endif /* INPUT_OFFSET != 0 */

#if K_OFFSET != 0
    acc0 += (VEC_INT)K_OFFSET;
    acc2 += (VEC_INT)K_OFFSET;
#endif /* K_OFFSET != 0 */

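    // Requantize the two output rows and saturate to the QASYMM8 range [0, 255].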
    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc0 += (VEC_INT)OUTPUT_OFFSET;
    acc2 += (VEC_INT)OUTPUT_OFFSET;
    VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
    VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
    res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
    res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);

    VSTORE(VEC_SIZE)
    (res0, 0, dst.ptr + 0 * dst_stride_y);
    VSTORE(VEC_SIZE)
    (res2, 0, dst.ptr + 1 * dst_stride_y);
}

#endif /* defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) */

#endif /* defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */