Blame - src/core/CL/cl_kernels/direct_convolution_quantized.cl - ml/ComputeLibrary

2017-11-03 17:33:15 +0700

[diff] [blame]

1

/*

Sheri Zhang

2020-02-20 11:23:08 +0000

[diff] [blame]

2

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

22

* SOFTWARE.

23

*/

24

#include "helpers_asymm.h"

25

Sheri Zhang

2020-02-20 11:23:08 +0000

[diff] [blame]

26

#undef CONVERT_SAT_STR

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

27

#undef CONVERT_SAT

28

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

29

#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

30

Sheri Zhang

2020-02-20 11:23:08 +0000

[diff] [blame]

31

/* Saturating conversion of an 8-element vector to the 8-wide vector of `dtype`.
 * The extra level of indirection lets a macro argument (such as DATA_TYPE) be
 * expanded before it is pasted into the convert_<dtype>8_sat name. */
#define CONVERT_SAT_STR(val, dtype) (convert_##dtype##8_sat((val)))
#define CONVERT_SAT(val, dtype) CONVERT_SAT_STR(val, dtype)

33

Giorgio Arena

2020-02-17 16:33:20 +0000

[diff] [blame^]

34

#if defined(DATA_LAYOUT_NHWC)

#if KERNEL_SIZE == 5

/* Bind CONVOLUTION1x5 to the variant that matches the compile-time STRIDE_X. */
#if STRIDE_X == 1
#define CONVOLUTION1x5(sum, in_ptr, w_ptr) CONVOLUTION1x5_STRIDE1(sum, in_ptr, w_ptr)
#elif STRIDE_X == 2
#define CONVOLUTION1x5(sum, in_ptr, w_ptr) CONVOLUTION1x5_STRIDE2(sum, in_ptr, w_ptr)
#else /* STRIDE_X is neither 1 nor 2 */
#error "STRIDE_X larger than 2 is not supported"
#endif /* STRIDE_X */

45

46

/** Accumulates one 1x5 filter row into 8 adjacent outputs (NHWC variant, STRIDE_X == 1).
 *
 * Expands inside the kernel body and takes src_stride_y, weights_stride_y,
 * input_offset and weight_offset from the enclosing scope.
 *
 * Output lane i gains sum over k in [0, 5) of
 * (src[i + k] + input_offset) * (w[k] + weight_offset).
 *
 * @param[in,out] acc         int8 accumulator.
 * @param[in]     src_ptr     Pointer to the first input value; consecutive values are stepped by src_stride_y.
 * @param[in]     weights_ptr Pointer to the first filter tap; consecutive taps are stepped by weights_stride_y.
 *
 * @note Input values are stepped with src_stride_y (not weights_stride_y), matching
 *       CONVOLUTION1x3_STRIDE2; the two strides are only interchangeable when they happen to be equal.
 */
#define CONVOLUTION1x5_STRIDE1(acc, src_ptr, weights_ptr) \
    ({ \
        int4 weights_values0 = 0; \
        int  weights_value1  = 0; \
        weights_values0.s0   = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
        weights_values0.s1   = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
        weights_values0.s2   = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
        weights_values0.s3   = convert_int(*(weights_ptr + 3 * weights_stride_y)); \
        weights_value1       = convert_int(*(weights_ptr + 4 * weights_stride_y)); \
        \
        int8 src0 = 0; \
        int4 src1 = 0; \
        src0.s0   = convert_int(*(src_ptr + 0 * src_stride_y)); \
        src0.s1   = convert_int(*(src_ptr + 1 * src_stride_y)); \
        src0.s2   = convert_int(*(src_ptr + 2 * src_stride_y)); \
        src0.s3   = convert_int(*(src_ptr + 3 * src_stride_y)); \
        src0.s4   = convert_int(*(src_ptr + 4 * src_stride_y)); \
        src0.s5   = convert_int(*(src_ptr + 5 * src_stride_y)); \
        src0.s6   = convert_int(*(src_ptr + 6 * src_stride_y)); \
        src0.s7   = convert_int(*(src_ptr + 7 * src_stride_y)); \
        src1.s0   = convert_int(*(src_ptr + 8 * src_stride_y)); \
        src1.s1   = convert_int(*(src_ptr + 9 * src_stride_y)); \
        src1.s2   = convert_int(*(src_ptr + 10 * src_stride_y)); \
        src1.s3   = convert_int(*(src_ptr + 11 * src_stride_y)); \
        \
        acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
        acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
        acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
        acc += ((int8)(src0.s345, src0.s67, src1.s012) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
        acc += ((int8)(src0.s45, src0.s67, src1.s0123) + input_offset) * ((int8)weights_value1 + weight_offset); \
    })

77

78

/** Accumulates one 1x5 filter row into 8 outputs taken every second input (NHWC variant, STRIDE_X == 2).
 *
 * Expands inside the kernel body and takes src_stride_y, weights_stride_y,
 * input_offset and weight_offset from the enclosing scope.
 *
 * Output lane i gains sum over k in [0, 5) of
 * (src[2 * i + k] + input_offset) * (w[k] + weight_offset).
 *
 * @param[in,out] acc         int8 accumulator.
 * @param[in]     src_ptr     Pointer to the first input value; consecutive values are stepped by src_stride_y.
 * @param[in]     weights_ptr Pointer to the first filter tap; consecutive taps are stepped by weights_stride_y.
 *
 * @note Input values are stepped with src_stride_y (not weights_stride_y), matching
 *       CONVOLUTION1x3_STRIDE2; the two strides are only interchangeable when they happen to be equal.
 */
#define CONVOLUTION1x5_STRIDE2(acc, src_ptr, weights_ptr) \
    ({ \
        int4 weights_values0 = 0; \
        int  weights_value1  = 0; \
        weights_values0.s0   = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
        weights_values0.s1   = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
        weights_values0.s2   = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
        weights_values0.s3   = convert_int(*(weights_ptr + 3 * weights_stride_y)); \
        weights_value1       = convert_int(*(weights_ptr + 4 * weights_stride_y)); \
        \
        int16 src0 = 0; \
        int4  src1 = 0; \
        src0.s0    = convert_int(*(src_ptr + 0 * src_stride_y)); \
        src0.s1    = convert_int(*(src_ptr + 1 * src_stride_y)); \
        src0.s2    = convert_int(*(src_ptr + 2 * src_stride_y)); \
        src0.s3    = convert_int(*(src_ptr + 3 * src_stride_y)); \
        src0.s4    = convert_int(*(src_ptr + 4 * src_stride_y)); \
        src0.s5    = convert_int(*(src_ptr + 5 * src_stride_y)); \
        src0.s6    = convert_int(*(src_ptr + 6 * src_stride_y)); \
        src0.s7    = convert_int(*(src_ptr + 7 * src_stride_y)); \
        src0.s8    = convert_int(*(src_ptr + 8 * src_stride_y)); \
        src0.s9    = convert_int(*(src_ptr + 9 * src_stride_y)); \
        src0.sa    = convert_int(*(src_ptr + 10 * src_stride_y)); \
        src0.sb    = convert_int(*(src_ptr + 11 * src_stride_y)); \
        src0.sc    = convert_int(*(src_ptr + 12 * src_stride_y)); \
        src0.sd    = convert_int(*(src_ptr + 13 * src_stride_y)); \
        src0.se    = convert_int(*(src_ptr + 14 * src_stride_y)); \
        src0.sf    = convert_int(*(src_ptr + 15 * src_stride_y)); \
        src1.s0    = convert_int(*(src_ptr + 16 * src_stride_y)); \
        src1.s1    = convert_int(*(src_ptr + 17 * src_stride_y)); \
        src1.s2    = convert_int(*(src_ptr + 18 * src_stride_y)); \
        src1.s3    = convert_int(*(src_ptr + 19 * src_stride_y)); \
        \
        acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
        acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
        acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
        acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
        acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + input_offset) * ((int8)weights_value1 + weight_offset); \
    })

117

118

#elif KERNEL_SIZE == 3

119

120

/* Bind CONVOLUTION1x3 to the variant that matches the compile-time STRIDE_X. */
#if STRIDE_X == 1
#define CONVOLUTION1x3(sum, in_ptr, w_ptr) CONVOLUTION1x3_STRIDE1(sum, in_ptr, w_ptr)
#elif STRIDE_X == 2
#define CONVOLUTION1x3(sum, in_ptr, w_ptr) CONVOLUTION1x3_STRIDE2(sum, in_ptr, w_ptr)
#else /* STRIDE_X is neither 1 nor 2 */
#error "STRIDE_X larger than 2 is not supported"
#endif /* STRIDE_X */

127

128

/** Accumulates one 1x3 filter row into 8 adjacent outputs (NHWC variant, STRIDE_X == 1).
 *
 * Expands inside the kernel body and takes src_stride_y, weights_stride_y,
 * input_offset and weight_offset from the enclosing scope.
 *
 * Output lane i gains sum over k in [0, 3) of
 * (src[i + k] + input_offset) * (w[k] + weight_offset).
 *
 * @param[in,out] acc         int8 accumulator.
 * @param[in]     src_ptr     Pointer to the first input value; consecutive values are stepped by src_stride_y.
 * @param[in]     weights_ptr Pointer to the first filter tap; consecutive taps are stepped by weights_stride_y.
 *
 * @note Input values are stepped with src_stride_y (not weights_stride_y), matching
 *       CONVOLUTION1x3_STRIDE2; the two strides are only interchangeable when they happen to be equal.
 */
#define CONVOLUTION1x3_STRIDE1(acc, src_ptr, weights_ptr) \
    ({ \
        int3 weights_values0 = 0; \
        weights_values0.s0   = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
        weights_values0.s1   = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
        weights_values0.s2   = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
        \
        int8 src0 = 0; \
        int2 src1 = 0; \
        src0.s0   = convert_int(*(src_ptr + 0 * src_stride_y)); \
        src0.s1   = convert_int(*(src_ptr + 1 * src_stride_y)); \
        src0.s2   = convert_int(*(src_ptr + 2 * src_stride_y)); \
        src0.s3   = convert_int(*(src_ptr + 3 * src_stride_y)); \
        src0.s4   = convert_int(*(src_ptr + 4 * src_stride_y)); \
        src0.s5   = convert_int(*(src_ptr + 5 * src_stride_y)); \
        src0.s6   = convert_int(*(src_ptr + 6 * src_stride_y)); \
        src0.s7   = convert_int(*(src_ptr + 7 * src_stride_y)); \
        src1.s0   = convert_int(*(src_ptr + 8 * src_stride_y)); \
        src1.s1   = convert_int(*(src_ptr + 9 * src_stride_y)); \
        \
        acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
        acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
        acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
    })

152

153

/** Accumulates one 1x3 filter row into 8 outputs taken every second input (NHWC variant, STRIDE_X == 2).
 *
 * Expands inside the kernel body and takes src_stride_y, weights_stride_y,
 * input_offset and weight_offset from the enclosing scope.
 *
 * Output lane i gains sum over k in [0, 3) of
 * (src[2 * i + k] + input_offset) * (w[k] + weight_offset).
 *
 * @param[in,out] acc         int8 accumulator.
 * @param[in]     src_ptr     Pointer to the first input value; consecutive values are stepped by src_stride_y.
 * @param[in]     weights_ptr Pointer to the first filter tap; consecutive taps are stepped by weights_stride_y.
 */
#define CONVOLUTION1x3_STRIDE2(acc, src_ptr, weights_ptr) \
    ({ \
        const int3 conv_w = (int3)(convert_int(*(weights_ptr + 0 * weights_stride_y)), \
                                   convert_int(*(weights_ptr + 1 * weights_stride_y)), \
                                   convert_int(*(weights_ptr + 2 * weights_stride_y))); \
        \
        const int16 conv_in = (int16)(convert_int(*(src_ptr + 0 * src_stride_y)), \
                                      convert_int(*(src_ptr + 1 * src_stride_y)), \
                                      convert_int(*(src_ptr + 2 * src_stride_y)), \
                                      convert_int(*(src_ptr + 3 * src_stride_y)), \
                                      convert_int(*(src_ptr + 4 * src_stride_y)), \
                                      convert_int(*(src_ptr + 5 * src_stride_y)), \
                                      convert_int(*(src_ptr + 6 * src_stride_y)), \
                                      convert_int(*(src_ptr + 7 * src_stride_y)), \
                                      convert_int(*(src_ptr + 8 * src_stride_y)), \
                                      convert_int(*(src_ptr + 9 * src_stride_y)), \
                                      convert_int(*(src_ptr + 10 * src_stride_y)), \
                                      convert_int(*(src_ptr + 11 * src_stride_y)), \
                                      convert_int(*(src_ptr + 12 * src_stride_y)), \
                                      convert_int(*(src_ptr + 13 * src_stride_y)), \
                                      convert_int(*(src_ptr + 14 * src_stride_y)), \
                                      convert_int(*(src_ptr + 15 * src_stride_y))); \
        /* 17th input value: needed by the last lane of the third tap. */ \
        const int conv_in_tail = convert_int(*(src_ptr + 16 * src_stride_y)); \
        \
        acc += (conv_in.even + input_offset) * ((int8)conv_w.s0 + weight_offset); \
        acc += ((int8)(conv_in.s1357, conv_in.s9BDF) + input_offset) * ((int8)conv_w.s1 + weight_offset); \
        acc += ((int8)(conv_in.s2468, conv_in.sACE, conv_in_tail) + input_offset) * ((int8)conv_w.s2 + weight_offset); \
    })

183

184

#elif KERNEL_SIZE == 1

185

186

/* Bind INPUT_VALUE to the input-gathering helper that matches the compile-time STRIDE_X. */
#if STRIDE_X == 1
#define INPUT_VALUE extract_input_stride1
#elif STRIDE_X == 2
#define INPUT_VALUE extract_input_stride2
#elif STRIDE_X == 3
#define INPUT_VALUE extract_input_stride3
#else /* STRIDE_X is not 1, 2 or 3 */
#error "Only support strides 1, 2 and 3"
#endif /* STRIDE_X */

196

197

#endif // KERNEL_SIZE == 1

198

199

/** Extracts a 1D horizontal vector from the input tensor with stride as 1.

200

*

201

* @param[in] input_value Pointer to the first value.

202

*

203

* @return extracted input values.

204

*/

205

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value, const uchar stride_y)

206

{

207

VEC_DATA_TYPE(DATA_TYPE, 8)

208

vals;

209

vals.s0 = *(input_value + 0 * stride_y);

210

vals.s1 = *(input_value + 1 * stride_y);

211

vals.s2 = *(input_value + 2 * stride_y);

212

vals.s3 = *(input_value + 3 * stride_y);

213

vals.s4 = *(input_value + 4 * stride_y);

214

vals.s5 = *(input_value + 5 * stride_y);

215

vals.s6 = *(input_value + 6 * stride_y);

216

vals.s7 = *(input_value + 7 * stride_y);

return vals;

}

/** Extracts a 1D horizontal vector from the input tensor with stride as 2.

222

*

223

* @param[in] input_value Pointer to the first value.

224

*

225

* @return extracted input values.

226

*/

227

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value, const uchar stride_y)

228

{

229

VEC_DATA_TYPE(DATA_TYPE, 8)

230

vals;

231

vals.s0 = *(input_value + 0 * stride_y);

232

vals.s1 = *(input_value + 2 * stride_y);

233

vals.s2 = *(input_value + 4 * stride_y);

234

vals.s3 = *(input_value + 6 * stride_y);

235

vals.s4 = *(input_value + 8 * stride_y);

236

vals.s5 = *(input_value + 10 * stride_y);

237

vals.s6 = *(input_value + 12 * stride_y);

238

vals.s7 = *(input_value + 14 * stride_y);

return vals;

}

/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.

244

*

245

* @param[in] input_value Pointer to the first value.

246

*

247

* @return extracted input values.

248

*/

249

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value, const uchar stride_y)

250

{

251

VEC_DATA_TYPE(DATA_TYPE, 8)

252

vals;

253

vals.s0 = *(input_value + 0 * stride_y);

254

vals.s1 = *(input_value + 3 * stride_y);

255

vals.s2 = *(input_value + 6 * stride_y);

256

vals.s3 = *(input_value + 9 * stride_y);

257

vals.s4 = *(input_value + 12 * stride_y);

258

vals.s5 = *(input_value + 15 * stride_y);

259

vals.s6 = *(input_value + 18 * stride_y);

260

vals.s7 = *(input_value + 21 * stride_y);

return vals;

}

/** This kernel performs a direct convolution to convolve the low three dimensions.

266

*

267

* @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1

268

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

269

* @note If biases are used then -DHAS_BIAS has to be passed at compile time

270

* @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234

271

* @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4

272

*

273

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED

274

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

275

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

276

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

277

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

278

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

279

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

280

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

281

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

282

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

283

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

284

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

285

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

286

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

287

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

288

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

289

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

290

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

291

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

292

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

293

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

294

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

295

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

296

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

297

* @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32

298

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

299

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

300

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

301

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

302

* @param[in] input_offset Input offset quantization parameter

303

* @param[in] weight_offset Weights offset quantization parameter

304

* @param[in] output_offset Output offset quantization parameter

305

*/

306

__kernel void direct_convolution_quantized(

307

TENSOR3D_DECLARATION(src),

308

TENSOR3D_DECLARATION(dst),

309

TENSOR3D_DECLARATION(weights),

310

#ifdef HAS_BIAS

311

VECTOR_DECLARATION(biases),

312

#endif /* defined(HAS_BIAS) */

313

unsigned int weights_stride_w,

int input_offset,

int weight_offset,

int output_offset)

{

Image src = CONVERT_TO_IMAGE_STRUCT(src);

319

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

320

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

int8 values0 = 0;

const int y_coord = (get_global_id(2) * STRIDE_Y) - PAD_TOP;

325

326

__global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0);

327

__global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0) - src_stride_x * get_global_id(0) + y_coord * (int)src_stride_z;

328

329

const int kernel_index = get_global_id(2);

330

weights_addr += kernel_index * weights_stride_w;

331

332

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

{

#if KERNEL_SIZE == 5

#if(PAD_TOP == 1)

if(y_coord < 0) // special case Z = -1 doesn't exists

337

{

338

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

339

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

340

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));

341

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));

342

}

343

else if(get_global_id(2) == (DST_HEIGHT - 1))

344

{

345

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

346

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

347

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

348

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));

}

else

{

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

353

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

354

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

355

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));

356

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));

}

#elif(PAD_TOP == 2)

if(y_coord < -1)

{

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

362

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));

363

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));

364

}

365

else if(y_coord == -1)

366

{

367

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

368

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

369

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));

370

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));

371

}

372

else if(y_coord == (SRC_HEIGHT - 3))

373

{

374

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

375

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

376

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

377

}

378

else if(y_coord >= (SRC_HEIGHT - 4))

379

{

380

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

381

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

382

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

383

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));

}

else

{

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

388

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

389

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

390

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));

391

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));

392

}

393

#else /* PAD_TOP == 2 */

394

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

395

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

396

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

397

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));

398

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));

399

#endif /* PAD_TOP == 1 */

400

#elif KERNEL_SIZE == 3

401

#if PAD_TOP > 0

402

if(y_coord < 0) // special case Z = -1 doesn't exists

403

{

404

//skip first row and load the two next ones

405

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

406

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

407

}

408

else if(y_coord == (SRC_HEIGHT - PAD_TOP - 1))

409

{

410

// special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the

411

// Z axis has no padding at all.

412

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

413

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

}

else

{

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

418

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

419

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

420

}

421

#else // PAD_TOP > 0

422

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));

423

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));

424

CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));

425

#endif // PAD_TOP > 0

426

#elif KERNEL_SIZE == 1

427

int weight = convert_int(*(__global DATA_TYPE *)weights_addr);

428

int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr, src_stride_y));

429

values0 += (input_value + input_offset) * ((int8)weight + weight_offset);

430

#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) */

431

432

src_addr += src_stride_x;

433

weights_addr += weights_stride_x;

}

#ifdef HAS_BIAS

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

438

__global int *bias_addr = ((__global int *)(vector_offset(&biases, get_global_id(0))));

439

values0 += (int8)(*bias_addr);

440

#endif /* defined(HAS_BIAS) */

441

442

#if OUTPUT_SHIFT < 0

443

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

444

#else // OUTPUT_SHIFT < 0

445

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

446

#endif // OUTPUT_SHIFT < 0

447

values0 = values0 + output_offset;

448

449

VEC_DATA_TYPE(DATA_TYPE, 8)

450

values = CONVERT_SAT(values0, DATA_TYPE);

451

*(dst.ptr + 0 * dst_stride_y) = values.s0;

452

*(dst.ptr + 1 * dst_stride_y) = values.s1;

453

*(dst.ptr + 2 * dst_stride_y) = values.s2;

454

*(dst.ptr + 3 * dst_stride_y) = values.s3;

455

*(dst.ptr + 4 * dst_stride_y) = values.s4;

456

*(dst.ptr + 5 * dst_stride_y) = values.s5;

457

*(dst.ptr + 6 * dst_stride_y) = values.s6;

458

*(dst.ptr + 7 * dst_stride_y) = values.s7;

459

}

460

461

#else // defined(DATA_LAYOUT_NHWC)

462

Sang-Hoon Park

2019-10-15 09:29:13 +0100

[diff] [blame]

#if KERNEL_SIZE == 9

/* Bind CONVOLUTION1x9 to the variant that matches the compile-time STRIDE_X. */
#if STRIDE_X == 1
#define CONVOLUTION1x9(sum, in_row, w_row) CONVOLUTION1x9_STRIDE1(sum, in_row, w_row)
#elif STRIDE_X == 2
#define CONVOLUTION1x9(sum, in_row, w_row) CONVOLUTION1x9_STRIDE2(sum, in_row, w_row)
#else /* STRIDE_X is neither 1 nor 2 */
#error "STRIDE_X larger than 2 is not supported"
#endif /* STRIDE_X */

472

473

/** Accumulates one 1x9 filter row into 8 adjacent outputs (contiguous rows, STRIDE_X == 1).
 *
 * Takes input_offset and weight_offset from the enclosing scope.
 * Output lane i gains sum over k in [0, 9) of
 * (src[i + k] + input_offset) * (w[k] + weight_offset).
 *
 * @param[in,out] acc             int8 accumulator.
 * @param[in]     src_row_ptr     Pointer to 16 contiguous input values.
 * @param[in]     weights_row_ptr Pointer to 9 contiguous filter taps.
 */
#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
    ({ \
        const int8  conv_w      = convert_int8(vload8(0, weights_row_ptr)); \
        const int   conv_w_tail = convert_int(*(weights_row_ptr + 8)); \
        const int16 conv_in     = convert_int16(vload16(0, src_row_ptr)); \
        /* Tap k multiplies the 8-wide window of the input row that starts at element k. */ \
        acc += (conv_in.s01234567 + input_offset) * ((int8)conv_w.s0 + weight_offset); \
        acc += (conv_in.s12345678 + input_offset) * ((int8)conv_w.s1 + weight_offset); \
        acc += (conv_in.s23456789 + input_offset) * ((int8)conv_w.s2 + weight_offset); \
        acc += (conv_in.s3456789A + input_offset) * ((int8)conv_w.s3 + weight_offset); \
        acc += (conv_in.s456789AB + input_offset) * ((int8)conv_w.s4 + weight_offset); \
        acc += (conv_in.s56789ABC + input_offset) * ((int8)conv_w.s5 + weight_offset); \
        acc += (conv_in.s6789ABCD + input_offset) * ((int8)conv_w.s6 + weight_offset); \
        acc += (conv_in.s789ABCDE + input_offset) * ((int8)conv_w.s7 + weight_offset); \
        acc += (conv_in.s89ABCDEF + input_offset) * ((int8)conv_w_tail + weight_offset); \
    })

488

489

/** Accumulates one 1x9 filter row into 8 outputs taken every second input (contiguous rows, STRIDE_X == 2).
 *
 * Takes input_offset and weight_offset from the enclosing scope.
 * Output lane i gains sum over k in [0, 9) of
 * (src[2 * i + k] + input_offset) * (w[k] + weight_offset).
 *
 * @param[in,out] acc             int8 accumulator.
 * @param[in]     src_row_ptr     Pointer to 24 contiguous input values.
 * @param[in]     weights_row_ptr Pointer to 9 contiguous filter taps.
 */
#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
    ({ \
        const int8  conv_w      = convert_int8(vload8(0, weights_row_ptr)); \
        const int   conv_w_tail = convert_int(*(weights_row_ptr + 8)); \
        const int16 conv_in_lo  = convert_int16(vload16(0, src_row_ptr)); \
        const int8  conv_in_hi  = convert_int8(vload8(0, src_row_ptr + 16)); \
        acc += (conv_in_lo.even + input_offset) * ((int8)conv_w.s0 + weight_offset); \
        acc += ((int8)(conv_in_lo.s1357, conv_in_lo.s9BDF) + input_offset) * ((int8)conv_w.s1 + weight_offset); \
        acc += ((int8)(conv_in_lo.s2468, conv_in_lo.sACE, conv_in_hi.s0) + input_offset) * ((int8)conv_w.s2 + weight_offset); \
        acc += ((int8)(conv_in_lo.s3579, conv_in_lo.sBDF, conv_in_hi.s1) + input_offset) * ((int8)conv_w.s3 + weight_offset); \
        acc += ((int8)(conv_in_lo.s468A, conv_in_lo.sCE, conv_in_hi.s02) + input_offset) * ((int8)conv_w.s4 + weight_offset); \
        acc += ((int8)(conv_in_lo.s579B, conv_in_lo.sDF, conv_in_hi.s13) + input_offset) * ((int8)conv_w.s5 + weight_offset); \
        acc += ((int8)(conv_in_lo.s68AC, conv_in_lo.sE, conv_in_hi.s024) + input_offset) * ((int8)conv_w.s6 + weight_offset); \
        acc += ((int8)(conv_in_lo.s79BD, conv_in_lo.sF, conv_in_hi.s135) + input_offset) * ((int8)conv_w.s7 + weight_offset); \
        acc += ((int8)(conv_in_lo.s8ACE, conv_in_hi.s0246) + input_offset) * ((int8)conv_w_tail + weight_offset); \
    })

505

506

#elif KERNEL_SIZE == 5

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

507

508

/* Bind CONVOLUTION1x5 to the variant that matches the compile-time STRIDE_X. */
#if STRIDE_X == 1
#define CONVOLUTION1x5(sum, in_row, w_row) CONVOLUTION1x5_STRIDE1(sum, in_row, w_row)
#elif STRIDE_X == 2
#define CONVOLUTION1x5(sum, in_row, w_row) CONVOLUTION1x5_STRIDE2(sum, in_row, w_row)
#else /* STRIDE_X is neither 1 nor 2 */
#error "STRIDE_X larger than 2 is not supported"
#endif /* STRIDE_X */

515

516

/** Accumulates one 1x5 filter row into 8 adjacent outputs (contiguous rows, STRIDE_X == 1).
 *
 * Takes input_offset and weight_offset from the enclosing scope.
 * Output lane i gains sum over k in [0, 5) of
 * (src[i + k] + input_offset) * (w[k] + weight_offset).
 *
 * @param[in,out] acc             int8 accumulator.
 * @param[in]     src_row_ptr     Pointer to 12 contiguous input values.
 * @param[in]     weights_row_ptr Pointer to 5 contiguous filter taps.
 */
#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
    ({ \
        const int4 conv_w      = convert_int4(vload4(0, weights_row_ptr)); \
        const int  conv_w_tail = convert_int(*(weights_row_ptr + 4)); \
        const int8 conv_in_lo  = convert_int8(vload8(0, src_row_ptr)); \
        const int4 conv_in_hi  = convert_int4(vload4(0, src_row_ptr + 8)); \
        acc += (conv_in_lo + input_offset) * ((int8)conv_w.s0 + weight_offset); \
        acc += ((int8)(conv_in_lo.s1234, conv_in_lo.s567, conv_in_hi.s0) + input_offset) * ((int8)conv_w.s1 + weight_offset); \
        acc += ((int8)(conv_in_lo.s234, conv_in_lo.s567, conv_in_hi.s01) + input_offset) * ((int8)conv_w.s2 + weight_offset); \
        acc += ((int8)(conv_in_lo.s345, conv_in_lo.s67, conv_in_hi.s012) + input_offset) * ((int8)conv_w.s3 + weight_offset); \
        acc += ((int8)(conv_in_lo.s45, conv_in_lo.s67, conv_in_hi.s0123) + input_offset) * ((int8)conv_w_tail + weight_offset); \
    })

528

529

/** Accumulates one row of a 1x5 filter into 8 horizontally adjacent outputs, for a stride of 2 along X.
 *
 * Lane i of @p acc is increased by the sum over k = 0..4 of
 * (src_row_ptr[2 * i + k] + input_offset) * (weights_row_ptr[k] + weight_offset).
 *
 * @note input_offset and weight_offset are not macro parameters: they are picked up from the scope
 *       in which the macro is expanded.
 * @note Macro parameters are parenthesized so that arbitrary pointer expressions can be passed safely.
 *       @p src_row_ptr and @p weights_row_ptr are each evaluated twice.
 *
 * @param[in,out] acc             8-lane integer accumulator (an int8 in the kernel of this file).
 * @param[in]     src_row_ptr     Pointer to the first input element of the row; 20 consecutive elements are read.
 * @param[in]     weights_row_ptr Pointer to the first weight of the row; 5 consecutive elements are read.
 */
#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
    ({ \
        int4  weights_values0 = convert_int4(vload4(0, (weights_row_ptr))); \
        int   weights_value1  = convert_int(*((weights_row_ptr) + 4)); \
        int16 src0            = convert_int16(vload16(0, (src_row_ptr))); \
        int4  src1            = convert_int4(vload4(0, (src_row_ptr) + 16)); \
        (acc) += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
        (acc) += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
        (acc) += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
        (acc) += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
        (acc) += ((int8)(src0.s468A, src0.sCE, src1.s02) + input_offset) * ((int8)weights_value1 + weight_offset); \
    })

541

542

#elif KERNEL_SIZE == 3

543

544

/* Bind CONVOLUTION1x3 to the row-convolution variant that matches the compile-time STRIDE_X. */
#if STRIDE_X == 2
#define CONVOLUTION1x3(sum, src_row, weights_row) CONVOLUTION1x3_STRIDE2(sum, src_row, weights_row)
#elif STRIDE_X == 1
#define CONVOLUTION1x3(sum, src_row, weights_row) CONVOLUTION1x3_STRIDE1(sum, src_row, weights_row)
#else /* STRIDE_X is neither 1 nor 2 */
#error "STRIDE_X larger than 2 is not supported"
#endif /* STRIDE_X */

551

552

/** Accumulates one row of a 1x3 filter into 8 horizontally adjacent outputs, for a stride of 1 along X.
 *
 * Lane i of @p acc is increased by the sum over k = 0..2 of
 * (src_row_ptr[i + k] + input_offset) * (weights_row_ptr[k] + weight_offset).
 *
 * @note input_offset and weight_offset are not macro parameters: they are picked up from the scope
 *       in which the macro is expanded.
 * @note Macro parameters are parenthesized so that arbitrary pointer expressions can be passed safely.
 *       @p src_row_ptr is evaluated twice.
 *
 * @param[in,out] acc             8-lane integer accumulator (an int8 in the kernel of this file).
 * @param[in]     src_row_ptr     Pointer to the first input element of the row; 10 consecutive elements are read.
 * @param[in]     weights_row_ptr Pointer to the first weight of the row; 3 consecutive elements are read.
 */
#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
    ({ \
        int3 weights_values0 = convert_int3(vload3(0, (weights_row_ptr))); \
        int8 src0            = convert_int8(vload8(0, (src_row_ptr))); \
        int2 src1            = convert_int2(vload2(0, (src_row_ptr) + 8)); \
        (acc) += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
        (acc) += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
        (acc) += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
    })

561

562

/** Accumulates one row of a 1x3 filter into 8 horizontally adjacent outputs, for a stride of 2 along X.
 *
 * Lane i of @p acc is increased by the sum over k = 0..2 of
 * (src_row_ptr[2 * i + k] + input_offset) * (weights_row_ptr[k] + weight_offset).
 *
 * @note input_offset and weight_offset are not macro parameters: they are picked up from the scope
 *       in which the macro is expanded.
 * @note Macro parameters are parenthesized so that arbitrary pointer expressions can be passed safely.
 *       @p src_row_ptr is evaluated twice.
 *
 * @param[in,out] acc             8-lane integer accumulator (an int8 in the kernel of this file).
 * @param[in]     src_row_ptr     Pointer to the first input element of the row; 17 consecutive elements are read.
 * @param[in]     weights_row_ptr Pointer to the first weight of the row; 3 consecutive elements are read.
 */
#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
    ({ \
        int3  weights_values0 = convert_int3(vload3(0, (weights_row_ptr))); \
        int16 src0            = convert_int16(vload16(0, (src_row_ptr))); \
        int   src1            = convert_int(*((src_row_ptr) + 16)); \
        (acc) += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
        (acc) += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
        (acc) += ((int8)(src0.s2468, src0.sACE, src1) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
    })

571

572

#elif KERNEL_SIZE == 1

573

574

/* Bind INPUT_VALUE to the input-extraction helper that matches the compile-time STRIDE_X. */
#if STRIDE_X == 1
#define INPUT_VALUE extract_input_stride1
#elif STRIDE_X == 2
#define INPUT_VALUE extract_input_stride2
#elif STRIDE_X == 3
#define INPUT_VALUE extract_input_stride3
#else /* STRIDE_X is none of 1, 2, 3 */
#error "Only support strides 1, 2 and 3"
#endif /* STRIDE_X */

584

585

/** Reads 8 horizontally adjacent input values (stride of 1 along X).
 *
 * @param[in] input_value Pointer to the first value; elements 0..7 are loaded.
 *
 * @return the 8 loaded values, in memory order.
 */
inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value)
{
    const VEC_DATA_TYPE(DATA_TYPE, 8) row = vload8(0, input_value);
    return row;
}

595

596

/** Reads 8 input values spaced 2 elements apart (stride of 2 along X).
 *
 * @param[in] input_value Pointer to the first value; elements 0..15 are loaded.
 *
 * @return the values at element offsets 0, 2, 4, ..., 14.
 */
inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value)
{
    const VEC_DATA_TYPE(DATA_TYPE, 16) wide = vload16(0, input_value);

    // Keep every second element, starting with element 0.
    return wide.even;
}

608

609

/** Reads 8 input values spaced 3 elements apart (stride of 3 along X, 8-bit data).
 *
 * Two overlapping 16-element loads are issued, at element offsets 0 and 12, and every
 * third element of each is kept.
 *
 * @param[in] input_value Pointer to the first value; elements 0..27 are loaded.
 *
 * @return the values at element offsets 0, 3, 6, ..., 21.
 */
inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value)
{
    // Covers offsets 0, 3, 6 and 9.
    const VEC_DATA_TYPE(DATA_TYPE, 16) lower = vload16(0, input_value);
    // Starts 12 elements further on, so the same swizzle covers offsets 12, 15, 18 and 21.
    const VEC_DATA_TYPE(DATA_TYPE, 16) upper = vload16(0, input_value + 12);

    return (VEC_DATA_TYPE(DATA_TYPE, 8))(lower.s0369, upper.s0369);
}

623

Sang-Hoon Park

2019-10-15 09:29:13 +0100

[diff] [blame]

624

#else /* KERNEL_SIZE is not 1, 3, 5 or 9 */

625

#error "Only kernel sizes 1, 3, 5 and 9 are supported"

Chunosov

2017-11-03 17:33:15 +0700

[diff] [blame]

626

#endif /* KERNEL_SIZE */

627

628

/** Direct convolution over the three lowest tensor dimensions for quantized data.
 *
 * Each work-item accumulates, over WEIGHTS_DEPTH input planes, the products of offset-corrected
 * inputs and weights for 8 horizontally adjacent outputs, optionally adds a bias, requantizes the
 * result with OUTPUT_MULTIPLIER / OUTPUT_SHIFT, adds the output offset and stores the 8 saturated
 * values. The filter (and bias) is selected by get_global_id(2).
 *
 * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
 * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
 * @note The filter size is selected at compile time through KERNEL_SIZE; only 1, 3, 5 and 9 are handled
 * @note If biases are used then -DHAS_BIAS has to be passed at compile time
 * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
 * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
 *
 * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
 * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: S32
 * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
 * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
 * @param[in]  input_offset                          Input offset quantization parameter
 * @param[in]  weight_offset                         Weights offset quantization parameter
 * @param[in]  output_offset                         Output offset quantization parameter
 */
__kernel void direct_convolution_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#ifdef HAS_BIAS
    VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
    unsigned int weights_stride_w,
    int          input_offset,
    int          weight_offset,
    int          output_offset)
{
    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // The third global id picks the filter, and later the matching bias.
    const int filter_id = get_global_id(2);

    __global DATA_TYPE *src_addr     = (__global DATA_TYPE *)offset(&src, 0, 0);
    __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0);
    weights_addr += filter_id * weights_stride_w;

    // 32-bit accumulators for the 8 outputs handled by this work-item.
    int8 accum = 0;

    // NOTE(review): the loop counter is deliberately left volatile as in the original code;
    // presumably this keeps the compiler from unrolling the depth loop - confirm before changing.
    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
    {
#if KERNEL_SIZE == 9
        CONVOLUTION1x9(accum, src_addr, weights_addr);
        CONVOLUTION1x9(accum, (src_addr + 1 * src_stride_y), (weights_addr + 1 * weights_stride_y));
        CONVOLUTION1x9(accum, (src_addr + 2 * src_stride_y), (weights_addr + 2 * weights_stride_y));
        CONVOLUTION1x9(accum, (src_addr + 3 * src_stride_y), (weights_addr + 3 * weights_stride_y));
        CONVOLUTION1x9(accum, (src_addr + 4 * src_stride_y), (weights_addr + 4 * weights_stride_y));
        CONVOLUTION1x9(accum, (src_addr + 5 * src_stride_y), (weights_addr + 5 * weights_stride_y));
        CONVOLUTION1x9(accum, (src_addr + 6 * src_stride_y), (weights_addr + 6 * weights_stride_y));
        CONVOLUTION1x9(accum, (src_addr + 7 * src_stride_y), (weights_addr + 7 * weights_stride_y));
        CONVOLUTION1x9(accum, (src_addr + 8 * src_stride_y), (weights_addr + 8 * weights_stride_y));
#elif KERNEL_SIZE == 5
        CONVOLUTION1x5(accum, src_addr, weights_addr);
        CONVOLUTION1x5(accum, (src_addr + 1 * src_stride_y), (weights_addr + 1 * weights_stride_y));
        CONVOLUTION1x5(accum, (src_addr + 2 * src_stride_y), (weights_addr + 2 * weights_stride_y));
        CONVOLUTION1x5(accum, (src_addr + 3 * src_stride_y), (weights_addr + 3 * weights_stride_y));
        CONVOLUTION1x5(accum, (src_addr + 4 * src_stride_y), (weights_addr + 4 * weights_stride_y));
#elif KERNEL_SIZE == 3
        CONVOLUTION1x3(accum, src_addr, weights_addr);
        CONVOLUTION1x3(accum, (src_addr + 1 * src_stride_y), (weights_addr + 1 * weights_stride_y));
        CONVOLUTION1x3(accum, (src_addr + 2 * src_stride_y), (weights_addr + 2 * weights_stride_y));
#elif KERNEL_SIZE == 1
        // A single weight per plane, applied to the 8 inputs gathered with the stride-specific helper.
        const int  w       = convert_int(*weights_addr);
        const int8 in_vals = convert_int8(INPUT_VALUE(src_addr));
        accum += (in_vals + input_offset) * ((int8)w + weight_offset);
#endif /* KERNEL_SIZE */

        // Advance both tensors to the next plane along Z.
        src_addr += src_stride_z;
        weights_addr += weights_stride_z;
    }

#ifdef HAS_BIAS
    Vector    biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
    const int bias   = *((__global int *)vector_offset(&biases, filter_id));
    // The same bias is added to all 8 outputs.
    accum += (int8)bias;
#endif /* defined(HAS_BIAS) */

    // Requantize: the sign of OUTPUT_SHIFT selects which fixed-point multiply helper is used.
#if OUTPUT_SHIFT < 0
    accum = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(accum, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
#else  // OUTPUT_SHIFT < 0
    accum = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(accum, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
#endif // OUTPUT_SHIFT < 0
    accum += output_offset;

    // Saturate to the output data type and write the 8 results.
    vstore8(CONVERT_SAT(accum, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr);
}

Giorgio Arena

2020-02-17 16:33:20 +0000

[diff] [blame^]

740

741

#endif // defined(DATA_LAYOUT_NHWC)

Michele Di Giorgio