Blame - src/core/CL/cl_kernels/depthwise_convolution_quantized.cl - ml/ComputeLibrary

2019-10-09 15:32:39 +0100

[diff] [blame]

51

52

#if defined(WEIGHTS_PROMOTED_TYPE)

53

#define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

54

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

55

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

56

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

57

#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

58

#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

59

#define ARM_DOT(x, y, val) val += arm_dot((x), (y));

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

60

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

61

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

62

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

63

#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

64

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

65

#if CONV_STRIDE_X > 3

66

#error "Stride X not supported"

67

#endif /* CONV_STRIDE_X > 3 */

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

68

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

69

#if !defined(IS_DOT8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

70

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

71

#if DILATION_X == 1

72

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

73

#if CONV_STRIDE_X == 1

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

74

#define GET_VALUES(first_value, left, middle, right) \

75

({ \

76

int8 temp0 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \

77

int2 temp1 = CONVERT(vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))), int2); \

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

78

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

79

left = CONVERT(temp0.s01234567, int8); \

80

middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \

81

right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

82

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

83

#elif CONV_STRIDE_X == 2

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

84

#define GET_VALUES(first_value, left, middle, right) \

85

({ \

86

int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \

87

int temp1 = CONVERT(*((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int); \

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

88

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

89

left = CONVERT(temp0.s02468ace, int8); \

90

middle = CONVERT(temp0.s13579bdf, int8); \

91

right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

92

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

93

#else /* CONV_STRIDE_X */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

94

#define GET_VALUES(first_value, left, middle, right) \

95

({ \

96

int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \

97

int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

98

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

99

left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \

100

middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \

101

right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

102

})

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

103

#endif /* CONV_STRIDE_X */

104

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

105

#else /* DILATION_X == 1 */

106

107

#if CONV_STRIDE_X == 1

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

108

#define GET_VALUES(first_value, left, middle, right) \

109

({ \

110

left = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \

111

middle = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int8); \

112

right = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int8); \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

113

})

114

#elif CONV_STRIDE_X == 2

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

115

#define GET_VALUES(first_value, left, middle, right) \

116

({ \

117

int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \

118

left = CONVERT(temp0.s02468ace, int8); \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

119

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

120

temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \

121

middle = CONVERT(temp0.s02468ace, int8); \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

122

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

123

temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \

124

right = CONVERT(temp0.s02468ace, int8); \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

125

})

126

#else /* CONV_STRIDE_X */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

127

#define GET_VALUES(first_value, left, middle, right) \

128

({ \

129

int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \

130

int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \

131

left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

132

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

133

temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \

134

temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))), int8); \

135

middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

136

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

137

temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \

138

temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))), int8); \

139

right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

140

})

141

142

#endif /* CONV_STRIDE_X */

143

#endif /* DILATION_X==1 */

144

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

145

/** This function computes the depthwise convolution quantized.

Anthony Barbier

f202e50

2017-11-23 18:02:04 +0000

[diff] [blame]

146

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

147

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

148

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

149

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

150

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

151

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

152

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

153

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

154

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

155

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

156

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

157

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

158

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

159

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

160

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

161

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

162

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

163

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

164

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

165

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

166

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

167

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

168

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

169

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

170

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

171

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

172

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

173

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

174

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

175

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

176

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

177

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

178

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

179

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

180

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

181

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

182

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

Anthony Barbier

f202e50

2017-11-23 18:02:04 +0000

[diff] [blame]

183

*/

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

184

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

185

__kernel void dwc_3x3_native_quantized8_nchw(

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

186

TENSOR3D_DECLARATION(src),

187

TENSOR3D_DECLARATION(dst),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

188

TENSOR3D_DECLARATION(weights),

189

VECTOR_DECLARATION(output_multipliers),

190

VECTOR_DECLARATION(output_shifts)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

191

#if defined(HAS_BIAS)

Georgios Pinitas

5b2191e

2018-02-22 12:56:51 +0000

[diff] [blame]

192

,

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

193

VECTOR_DECLARATION(biases)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

194

#endif //defined(HAS_BIAS)

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

195

)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

196

{

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

197

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

198

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

199

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

200

Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);

201

Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

202

203

// Extract channel and linearized batch indices

204

const int channel = get_global_id(2) % DST_CHANNELS;

205

const int batch = get_global_id(2) / DST_CHANNELS;

206

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

207

#if defined(HAS_BIAS)

208

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

209

Georgios Pinitas

728d3cf

2018-09-21 13:41:35 +0100

[diff] [blame]

210

int bias_value = *((__global int *)(vector_offset(&biases, channel)));

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

211

#endif //defined(HAS_BIAS)

212

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

213

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

214

src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

215

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

Giorgio Arena

7657224

2018-04-04 17:44:26 +0100

[diff] [blame]

216

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

217

VEC_DATA_TYPE(WEIGHTS_TYPE, 3)

218

w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));

219

VEC_DATA_TYPE(WEIGHTS_TYPE, 3)

220

w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));

221

VEC_DATA_TYPE(WEIGHTS_TYPE, 3)

222

w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));

223

224

#if defined(PER_CHANNEL_QUANTIZATION)

225

const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel));

226

const int output_shift = *((__global int *)vector_offset(&output_shifts, channel));

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

227

#endif // defined(PER_CHANNEL_QUANTIZATION)

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

228

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

229

int8 values0 = 0;

230

int8 sum0 = 0;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

231

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

232

int8 values1 = 0;

233

int8 sum1 = 0;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

234

#endif /* CONV_STRIDE_Y &&DILATION_Y==1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

235

236

// Row0

237

int8 left, middle, right;

238

GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);

239

values0 += left * (int8)(w0.s0);

240

values0 += middle * (int8)(w0.s1);

241

values0 += right * (int8)(w0.s2);

242

243

#if WEIGHTS_OFFSET != 0

244

sum0 += left + middle + right;

245

#endif /* WEIGHTS_OFFSET != 0 */

246

247

// Row1

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

248

GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right);

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

249

values0 += left * (int8)(w1.s0);

250

values0 += middle * (int8)(w1.s1);

251

values0 += right * (int8)(w1.s2);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

252

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

253

values1 += left * (int8)(w0.s0);

254

values1 += middle * (int8)(w0.s1);

255

values1 += right * (int8)(w0.s2);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

256

#endif /* CONV_STRIDE_Y && DILATION_Y== 1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

257

258

#if WEIGHTS_OFFSET != 0

259

int8 tmp = left + middle + right;

260

sum0 += tmp;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

261

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

262

sum1 += tmp;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

263

#endif /* CONV_STRIDE_Y &&DILATION_Y== 1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

264

#endif /* WEIGHTS_OFFSET != 0 */

265

266

// Row2

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

267

GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right);

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

268

values0 += left * (int8)(w2.s0);

269

values0 += middle * (int8)(w2.s1);

270

values0 += right * (int8)(w2.s2);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

271

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

272

values1 += left * (int8)(w1.s0);

273

values1 += middle * (int8)(w1.s1);

274

values1 += right * (int8)(w1.s2);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

275

#endif /* CONV_STRIDE_Y &&DILATION_Y == 1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

276

277

#if WEIGHTS_OFFSET != 0

278

tmp = left + middle + right;

279

sum0 += tmp;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

280

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

281

sum1 += tmp;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

282

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

283

#endif /* WEIGHTS_OFFSET != 0 */

284

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

285

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

286

// Row3

287

GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);

288

values1 += left * (int8)(w2.s0);

289

values1 += middle * (int8)(w2.s1);

290

values1 += right * (int8)(w2.s2);

291

292

#if WEIGHTS_OFFSET != 0

293

sum1 += left + middle + right;

294

#endif /* WEIGHTS_OFFSET != 0 */

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

295

#endif /* CONV_STRIDE_Y && DILATION_Y == 1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

296

297

#if defined(HAS_BIAS)

298

values0 += (int8)(bias_value);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

299

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

300

values1 += (int8)(bias_value);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

301

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

302

#endif //defined(HAS_BIAS)

303

304

#if WEIGHTS_OFFSET != 0

305

values0 += sum0 * (int8)(WEIGHTS_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

306

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

307

values1 += sum1 * (int8)(WEIGHTS_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

308

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

309

#endif /* WEIGHTS_OFFSET != 0 */

310

311

#if INPUT_OFFSET != 0

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

312

VEC_WEIGHTS_PROMOTED_TYPE(3)

313

tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));

314

315

WEIGHTS_PROMOTED_TYPE sum_weights = tmp_we.s0 + tmp_we.s1 + tmp_we.s2;

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

316

values0 += sum_weights * (int8)(INPUT_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

317

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

318

values1 += sum_weights * (int8)(INPUT_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

319

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

320

#endif /* INPUT_OFFSET != 0 */

321

322

#if K_OFFSET != 0

323

values0 += (int8)(K_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

324

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

325

values1 += (int8)(K_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

326

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

327

#endif /* K_OFFSET != 0 */

328

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

329

#if defined(REAL_MULTIPLIER)

330

331

values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);

332

333

#else // defined(REAL_MULTIPLIER)

334

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

335

#if defined(PER_CHANNEL_QUANTIZATION)

336

int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);

337

int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);

338

values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);

339

#else // defined(PER_CHANNEL_QUANTIZATION)

340

#if OUTPUT_SHIFT < 0

341

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

342

#else // OUTPUT_SHIFT < 0

343

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

344

#endif // OUTPUT_SHIFT < 0

345

#endif // defined(PER_CHANNEL_QUANTIZATION)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

346

347

#endif // defined(REAL_MULTIPLIER)

348

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

349

values0 += (int8)OUTPUT_OFFSET;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

350

VEC_TYPE(8)

351

res0 = CONVERT_SAT(values0, VEC_TYPE(8));

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

352

Giorgio Arena

99ac60b

2018-02-16 15:17:23 +0000

[diff] [blame]

353

vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

354

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

355

#if defined(REAL_MULTIPLIER)

356

357

values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);

358

359

#else // defined(REAL_MULTIPLIER)

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

360

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

361

#if defined(PER_CHANNEL_QUANTIZATION)

362

int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);

363

int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);

364

values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);

365

#else // defined(PER_CHANNEL_QUANTIZATION)

366

#if OUTPUT_SHIFT < 0

367

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

368

#else // OUTPUT_SHIFT < 0

369

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

370

#endif // OUTPUT_SHIFT < 0

371

#endif // defined(PER_CHANNEL_QUANTIZATION)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

372

373

#endif // defined(REAL_MULTIPLIER)

374

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

375

values1 += (int8)OUTPUT_OFFSET;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

376

VEC_TYPE(8)

377

res1 = CONVERT_SAT(values1, VEC_TYPE(8));

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

378

Giorgio Arena

99ac60b

2018-02-16 15:17:23 +0000

[diff] [blame]

379

vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

380

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/

Dmitry Savenko

2017-11-20 22:00:08 +0700

[diff] [blame]

381

}

Giorgio Arena

2018-02-16 11:01:04 +0000

[diff] [blame]

382

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

383

#else // !defined(IS_DOT8)

384

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

385

#if DILATION_X == 1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

386

#if CONV_STRIDE_X == 1

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

387

#define GET_VALUES(first_value, left, middle, right) \

388

({ \

389

VEC_TYPE(8) \

390

temp0 = vload8(0, (__global DATA_TYPE *)(first_value)); \

391

VEC_TYPE(2) \

392

temp1 = vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

393

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

394

left = temp0.s01234567; \

395

middle = (VEC_TYPE(8))(temp0.s1234, temp0.s567, temp1.s0); \

396

right = (VEC_TYPE(8))(temp0.s2345, temp0.s67, temp1.s01); \

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

397

})

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

398

#elif CONV_STRIDE_X == 2

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

399

#define GET_VALUES(first_value, left, middle, right) \

400

({ \

401

VEC_TYPE(16) \

402

temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \

403

DATA_TYPE temp1 = *((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

404

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

405

left = temp0.s02468ace; \

406

middle = temp0.s13579bdf; \

407

right = (VEC_TYPE(8))(temp0.s2468, temp0.sace, temp1); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

408

})

409

#else /* CONV_STRIDE_X */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

410

#define GET_VALUES(first_value, left, middle, right) \

411

({ \

412

VEC_TYPE(16) \

413

temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \

414

VEC_TYPE(8) \

415

temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

416

\

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

417

left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \

418

middle = (VEC_TYPE(8))(temp0.s147a, temp0.sd, temp1.s036); \

419

right = (VEC_TYPE(8))(temp0.s258b, temp0.se, temp1.s147); \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

420

})

421

#endif /* CONV_STRIDE_X */

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

422

#else /*DILATION_X==1*/

423

424

#if CONV_STRIDE_X == 1

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

425

#define GET_VALUES(first_value, left, middle, right) \

426

({ \

427

left = vload8(0, (__global DATA_TYPE *)(first_value)); \

428

middle = vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \

429

right = vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

430

})

431

#elif CONV_STRIDE_X == 2

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

432

#define GET_VALUES(first_value, left, middle, right) \

433

({ \

434

VEC_TYPE(16) \

435

temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \

436

left = temp0.s02468ace; \

437

temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \

438

middle = temp0.s02468ace; \

439

temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \

440

right = temp0.s02468ace; \

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

441

})

442

#else /* CONV_STRIDE_X */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

443

/* GET_VALUES for the IS_DOT8 path when DILATION_X != 1 and CONV_STRIDE_X is
 * neither 1 nor 2 (the remaining stride handled by this #else branch).
 *
 * Produces the three horizontal taps of one input row for eight outputs:
 *   - 24 consecutive DATA_TYPE elements are loaded (vload16 + vload8) starting
 *     at first_value, and every third element (indices 0,3,...,21) is kept;
 *   - the same gather is repeated with the start shifted by DILATION_X and by
 *     2 * DILATION_X elements.
 *
 * first_value: address of the first input element; offsets added to it are
 *              scaled by sizeof(DATA_TYPE) before the cast to
 *              __global DATA_TYPE * (presumably a byte pointer -- confirm at
 *              the call sites).
 * left, middle, right: lvalues that receive a VEC_TYPE(8) vector each
 *              (taps at offset 0, DILATION_X and 2 * DILATION_X).
 *
 * Note: the first vload8 previously carried one unmatched ')' which made this
 * branch fail to compile whenever it was selected; the parentheses are now
 * balanced like in the other two vload8 calls below.
 */
#define GET_VALUES(first_value, left, middle, right)                                                            \
    ({                                                                                                          \
        VEC_TYPE(16)                                                                                            \
        temp0 = vload16(0, (__global DATA_TYPE *)(first_value));                                                \
        VEC_TYPE(8)                                                                                             \
        temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE)));                        \
        left  = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25);                                               \
                                                                                                                \
        temp0  = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE)));              \
        temp1  = vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE)));        \
        middle = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25);                                              \
                                                                                                                \
        temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE)));           \
        temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE)));     \
        right = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25);                                               \
    })

459

460

#endif /* CONV_STRIDE_X */

461

#endif /*DILATION_X==1*/

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

462

/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

463

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

464

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

465

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

466

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

467

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

468

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

469

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

470

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

471

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

472

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

473

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

474

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

475

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

476

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

477

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

478

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

479

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

480

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

481

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

482

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

483

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

484

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

485

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

486

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

487

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

488

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

489

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

490

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

491

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

492

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

493

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

494

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

495

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

496

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

497

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

498

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

499

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

500

*/

501

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

502

__kernel void dwc_3x3_native_quantized8_dot8_nchw(

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

503

TENSOR3D_DECLARATION(src),

504

TENSOR3D_DECLARATION(dst),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

505

TENSOR3D_DECLARATION(weights),

506

VECTOR_DECLARATION(output_multipliers),

507

VECTOR_DECLARATION(output_shifts)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

508

#if defined(HAS_BIAS)

509

,

510

VECTOR_DECLARATION(biases)

511

#endif //defined(HAS_BIAS)

512

)

513

{

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

514

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

515

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

516

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

517

Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);

518

Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

519

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

520

// Extract channel and linearized batch indices

521

const int channel = get_global_id(2) % DST_CHANNELS;

522

const int batch = get_global_id(2) / DST_CHANNELS;

523

524

#if defined(HAS_BIAS)

525

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

526

527

const int bias_value = *((__global int *)(vector_offset(&biases, channel)));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

528

#endif //defined(HAS_BIAS)

529

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

530

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

531

src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

532

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

533

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

534

VEC_TYPE(3)

535

w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));

536

VEC_TYPE(3)

537

w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));

538

VEC_TYPE(3)

539

w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

540

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

541

const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0));

542

const int output_shift = *((__global int *)vector_offset(&output_shifts, 0));

543

544

VEC_TYPE(8)

545

left0, middle0, right0;

546

VEC_TYPE(8)

547

left1, middle1, right1;

548

VEC_TYPE(8)

549

left2, middle2, right2;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

int8 values0 = 0;

int8 sum0 = 0;

GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

555

GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1);

556

GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

557

558

#if WEIGHTS_OFFSET != 0

559

sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);

560

sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);

561

sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);

562

#endif /* WEIGHTS_OFFSET != 0 */

563

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

564

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

565

// If conv_stride_y is equal to 1 and dilation_y is equal to 1, we compute two output rows

566

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

567

VEC_TYPE(8)

568

left3, middle3, right3;

569

int8 values1 = 0;

570

int8 sum1 = 0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

571

572

GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);

573

574

#if WEIGHTS_OFFSET != 0

575

sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);

576

sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);

577

sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);

578

#endif /* WEIGHTS_OFFSET != 0 */

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

579

#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

580

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

581

ARM_DOT((VEC_TYPE(4))(left0.s0, middle0.s0, right0.s0, left1.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);

582

ARM_DOT((VEC_TYPE(4))(middle1.s0, right1.s0, left2.s0, middle2.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

583

values0.s0 += right2.s0 * w2.s2;

584

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

585

ARM_DOT((VEC_TYPE(4))(left0.s1, middle0.s1, right0.s1, left1.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);

586

ARM_DOT((VEC_TYPE(4))(middle1.s1, right1.s1, left2.s1, middle2.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

587

values0.s1 += right2.s1 * w2.s2;

588

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

589

ARM_DOT((VEC_TYPE(4))(left0.s2, middle0.s2, right0.s2, left1.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);

590

ARM_DOT((VEC_TYPE(4))(middle1.s2, right1.s2, left2.s2, middle2.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

591

values0.s2 += right2.s2 * w2.s2;

592

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

593

ARM_DOT((VEC_TYPE(4))(left0.s3, middle0.s3, right0.s3, left1.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);

594

ARM_DOT((VEC_TYPE(4))(middle1.s3, right1.s3, left2.s3, middle2.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

595

values0.s3 += right2.s3 * w2.s2;

596

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

597

ARM_DOT((VEC_TYPE(4))(left0.s4, middle0.s4, right0.s4, left1.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);

598

ARM_DOT((VEC_TYPE(4))(middle1.s4, right1.s4, left2.s4, middle2.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

599

values0.s4 += right2.s4 * w2.s2;

600

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

601

ARM_DOT((VEC_TYPE(4))(left0.s5, middle0.s5, right0.s5, left1.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);

602

ARM_DOT((VEC_TYPE(4))(middle1.s5, right1.s5, left2.s5, middle2.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

603

values0.s5 += right2.s5 * w2.s2;

604

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

605

ARM_DOT((VEC_TYPE(4))(left0.s6, middle0.s6, right0.s6, left1.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);

606

ARM_DOT((VEC_TYPE(4))(middle1.s6, right1.s6, left2.s6, middle2.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

607

values0.s6 += right2.s6 * w2.s2;

608

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

609

ARM_DOT((VEC_TYPE(4))(left0.s7, middle0.s7, right0.s7, left1.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);

610

ARM_DOT((VEC_TYPE(4))(middle1.s7, right1.s7, left2.s7, middle2.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

611

values0.s7 += right2.s7 * w2.s2;

612

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

613

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

614

ARM_DOT((VEC_TYPE(4))(left1.s0, middle1.s0, right1.s0, left2.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);

615

ARM_DOT((VEC_TYPE(4))(middle2.s0, right2.s0, left3.s0, middle3.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

616

values1.s0 += right3.s0 * w2.s2;

617

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

618

ARM_DOT((VEC_TYPE(4))(left1.s1, middle1.s1, right1.s1, left2.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);

619

ARM_DOT((VEC_TYPE(4))(middle2.s1, right2.s1, left3.s1, middle3.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

620

values1.s1 += right3.s1 * w2.s2;

621

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

622

ARM_DOT((VEC_TYPE(4))(left1.s2, middle1.s2, right1.s2, left2.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);

623

ARM_DOT((VEC_TYPE(4))(middle2.s2, right2.s2, left3.s2, middle3.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

624

values1.s2 += right3.s2 * w2.s2;

625

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

626

ARM_DOT((VEC_TYPE(4))(left1.s3, middle1.s3, right1.s3, left2.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);

627

ARM_DOT((VEC_TYPE(4))(middle2.s3, right2.s3, left3.s3, middle3.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

628

values1.s3 += right3.s3 * w2.s2;

629

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

630

ARM_DOT((VEC_TYPE(4))(left1.s4, middle1.s4, right1.s4, left2.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);

631

ARM_DOT((VEC_TYPE(4))(middle2.s4, right2.s4, left3.s4, middle3.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

632

values1.s4 += right3.s4 * w2.s2;

633

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

634

ARM_DOT((VEC_TYPE(4))(left1.s5, middle1.s5, right1.s5, left2.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);

635

ARM_DOT((VEC_TYPE(4))(middle2.s5, right2.s5, left3.s5, middle3.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

636

values1.s5 += right3.s5 * w2.s2;

637

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

638

ARM_DOT((VEC_TYPE(4))(left1.s6, middle1.s6, right1.s6, left2.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);

639

ARM_DOT((VEC_TYPE(4))(middle2.s6, right2.s6, left3.s6, middle3.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

640

values1.s6 += right3.s6 * w2.s2;

641

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

642

ARM_DOT((VEC_TYPE(4))(left1.s7, middle1.s7, right1.s7, left2.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);

643

ARM_DOT((VEC_TYPE(4))(middle2.s7, right2.s7, left3.s7, middle3.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

644

values1.s7 += right3.s7 * w2.s2;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

645

#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

646

647

#if defined(HAS_BIAS)

648

values0 += (int8)(bias_value);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

649

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

650

values1 += (int8)(bias_value);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

651

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

652

#endif //defined(HAS_BIAS)

653

654

#if WEIGHTS_OFFSET != 0

655

values0 += sum0 * (int8)(WEIGHTS_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

656

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

657

values1 += sum1 * (int8)(WEIGHTS_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

658

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

659

#endif /* WEIGHTS_OFFSET != 0 */

660

661

#if INPUT_OFFSET != 0

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

662

WEIGHTS_PROMOTED_TYPE sum_weights = 0;

663

VEC_WEIGHTS_PROMOTED_TYPE(3)

664

tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

665

sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;

666

values0 += sum_weights * (int8)(INPUT_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

667

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

668

values1 += sum_weights * (int8)(INPUT_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

669

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

670

#endif /* INPUT_OFFSET != 0 */

671

672

#if K_OFFSET != 0

673

values0 += (int8)(K_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

674

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

675

values1 += (int8)(K_OFFSET);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

676

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

677

#endif /* K_OFFSET != 0 */

678

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

679

#if defined(REAL_MULTIPLIER)

680

681

values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);

682

683

#else // defined(REAL_MULTIPLIER)

684

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

685

#if defined(PER_CHANNEL_QUANTIZATION)

686

int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);

687

int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);

688

values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);

689

#else // defined(PER_CHANNEL_QUANTIZATION)

690

#if OUTPUT_SHIFT < 0

691

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

692

#else // OUTPUT_SHIFT < 0

693

values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

694

#endif // OUTPUT_SHIFT < 0

695

#endif // defined(PER_CHANNEL_QUANTIZATION)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

696

697

#endif // defined(REAL_MULTIPLIER)

698

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

699

values0 += (int8)OUTPUT_OFFSET;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

700

VEC_TYPE(8)

701

res0 = CONVERT_SAT(values0, VEC_TYPE(8));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

702

703

vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

704

#if CONV_STRIDE_Y == 1 && DILATION_Y == 1

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

705

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

706

#if defined(REAL_MULTIPLIER)

707

708

values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);

709

710

#else // defined(REAL_MULTIPLIER)

711

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

712

#if defined(PER_CHANNEL_QUANTIZATION)

713

int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);

714

int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);

715

values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);

716

#else // defined(PER_CHANNEL_QUANTIZATION)

717

#if OUTPUT_SHIFT < 0

718

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

719

#else // OUTPUT_SHIFT < 0

720

values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);

721

#endif // OUTPUT_SHIFT < 0

722

#endif // defined(PER_CHANNEL_QUANTIZATION)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

723

724

#endif // defined(REAL_MULTIPLIER)

725

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

726

values1 += (int8)OUTPUT_OFFSET;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

727

VEC_TYPE(8)

728

res1 = CONVERT_SAT(values1, VEC_TYPE(8));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

729

730

vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame]

731

#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

732

}

733

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

734

#endif // !defined(IS_DOT8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

735

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

736

#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

737

738

#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

739

740

/* Shorthand for ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE with the vector width fixed to VEC_SIZE. */
#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)

741

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

742

/* Multiply-accumulate: converts x and y to VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE), multiplies them
 * component-wise, converts the product to VEC_INT (defined outside this excerpt) and adds it to acc.
 * The expansion carries no trailing semicolon; the call site supplies it.
 */
#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)) * CONVERT(y, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)), VEC_INT)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

743

744

/* MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum)
 *
 * Always performs MULTIPLY_ADD(x, y, acc). When WEIGHTS_OFFSET is non-zero it additionally adds
 * x (converted to VEC_INT) to `sum`; when WEIGHTS_OFFSET is zero the `sum` argument is ignored
 * and left untouched.
 */
#if WEIGHTS_OFFSET != 0

745

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \

746

({ \

747

sum += CONVERT(x, VEC_INT); \

748

MULTIPLY_ADD(x, y, acc); \

749

})

750

#else /* WEIGHTS_OFFSET != 0 */

751

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)

752

#endif /* WEIGHTS_OFFSET != 0 */

753

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

754

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

755

/* DOT_PRODUCT: accumulates a nine-term dot product into acc.
 *
 * val0..val3 are dotted with w0.s0123 and val4..val7 with w0.s4567 through ARM_DOT (defined near
 * the top of the file as arm_dot_acc or arm_dot, depending on the available extension); the ninth
 * term val8 * w1 is added with a plain multiply. acc is updated in place and is not reset here.
 */
#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \

756

({ \

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

757

ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc); \

758

ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc); \

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

759

acc += val8 * w1; \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

760

})

761

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

762

/* DOT_PRODUCT_REDUCTION: writes val0 + val1 + ... + val8 into sum.
 *
 * sum is overwritten with val0 first; val1..val4 and val5..val8 are then added by dotting each
 * group of four with an all-ones vector through ARM_DOT.
 */
#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \

763

({ \

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

764

sum = val0; \

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

765

ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum); \

766

ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum); \

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

767

})

768

769

/* DOT_PRODUCT_REDUCTION_WEIGHTS: writes the sum of nine weights into sum.
 *
 * sum is overwritten with w1 first; the eight components of w0 are then added in two groups of
 * four (w0.s0123 and w0.s4567) by dotting each group with an all-ones vector through ARM_DOT.
 */
#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \

770

({ \

771

sum = w1; \

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

772

ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum); \

773

ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum); \

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

774

})

775

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

776

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

777

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

778

#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

779

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.

780

*

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

781

* @note This kernel assumes VEC_SIZE is 4.

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

782

* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

783

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

784

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

785

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

786

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

787

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)

788

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

789

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

790

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

791

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

792

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

793

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

794

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

795

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

796

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

797

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

798

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

799

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

800

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

801

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

802

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

803

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

804

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

805

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

806

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

807

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

808

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

809

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

810

* @param[in] weights_ptr Pointer to the weights tensor reshaped. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

811

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

812

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

813

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

814

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

815

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

816

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

817

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

818

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

819

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

820

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

821

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

822

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

823

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

824

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

825

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

826

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

827

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

828

* @param[in] max_offset Max offset for the input tensor

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

829

*/

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

830

__kernel void dwc_3x3_reshaped_quantized8_nhwc(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst),
    IMAGE_DECLARATION(weights),
    VECTOR_DECLARATION(output_multipliers),
    VECTOR_DECLARATION(output_shifts),
#if defined(HAS_BIAS)
    VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
    int max_offset)
{
    // Each work-item produces one VEC_SIZE-wide vector of output channels for a single
    // output position (and, when DST_DEPTH is defined, a single batch).
    const int x = get_global_id(0); // channels
    const int y = get_global_id(1); // spatial coordinate x
#if defined(DST_DEPTH)
    // Global id 2 carries both the output row and the batch index
    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
    int b = get_global_id(2) / (int)DST_DEPTH; // batch
#else  // defined(DST_DEPTH)
    int z = get_global_id(2); // spatial coordinate y
#endif // defined(DST_DEPTH)

    // One row of the reshaped weights image per work-item along x
    __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;

#if defined(DST_DEPTH)
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
#else  /* defined(DST_DEPTH) */
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
#endif /* defined(DST_DEPTH) */

    int  z_coord = 0;
    int4 offset  = 0;

    // Input column of each horizontal filter tap (only lanes s0..s2 are read by the loads below)
    int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (int)CONV_PAD_LEFT;

    // A negative coordinate (left padding) becomes a very large value once cast to uint,
    // so a single min() maps it, like any coordinate above SRC_DIM_1, to SRC_DIM_1.
    y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
    y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
    y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
    y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

    // Byte offset of each column inside a row of the input
    int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

    // We compute VEC_SIZEx1x1 [C,W,H] elements
    VEC_INT acc = 0, sum = 0;

    // Load weights: 36 consecutive elements (16 + 16 + 4), split below into nine 4-wide
    // vectors w0..w8, one per filter tap.
    // NOTE(review): the "+ 16" / "+ 2 * 16" offsets are in bytes (weights_addr is uchar *),
    // so this presumably relies on WEIGHTS_TYPE being one byte wide - confirm.
    VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
    w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
    VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
    w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));

    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w0 = w0_tmp.s0123;
    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w1 = w0_tmp.s4567;
    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w2 = w0_tmp.s89AB;
    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w3 = w0_tmp.sCDEF;

    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w4 = w1_tmp.s0123;
    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w5 = w1_tmp.s4567;
    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w6 = w1_tmp.s89AB;
    VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
    w7 = w1_tmp.sCDEF;

#if INPUT_OFFSET != 0
    // Per-channel sum of the nine weights, used further down for the INPUT_OFFSET correction
    VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
                     + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
                     + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    // Load input values
    // z == 0
    // Clamp z_coord as for z = 0, it can be negative
    // z_coord is cast to unsigned int in order to use just a min() operation
    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
    z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;
    z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
    offset  = y_offset + (int4)(z_coord * src_stride_z);
    offset  = min(offset, (int4)max_offset);

    VEC_TYPE(VEC_SIZE)
    values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
    VEC_TYPE(VEC_SIZE)
    values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
    VEC_TYPE(VEC_SIZE)
    values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

    // z == 1
    // z_coord can be only negative for z = 0 so we do not need to clamp it
    // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
    // NOTE(review): this row is the only one loaded without the min(offset, max_offset) clamp;
    // the claim above presumably depends on the padding/dilation combinations accepted by the
    // host-side validation - confirm it still holds with DILATION_Y > 1 or CONV_PAD_TOP == 0.
    z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
    offset  = y_offset + (int4)(z_coord * src_stride_z);
    VEC_TYPE(VEC_SIZE)
    values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
    VEC_TYPE(VEC_SIZE)
    values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
    VEC_TYPE(VEC_SIZE)
    values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

    // z == 2
    // Offset can be out-of-bound so we need to check if it is greater than max_offset
    z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
    offset  = y_offset + (int4)(z_coord * src_stride_z);
    offset  = min(offset, (int4)max_offset);
    VEC_TYPE(VEC_SIZE)
    values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
    VEC_TYPE(VEC_SIZE)
    values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
    VEC_TYPE(VEC_SIZE)
    values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

    // Accumulate the nine taps, row by row.
    // NOTE(review): MULTIPLY_ADD_ACCUMULATE is defined outside this block; presumably it adds
    // the input/weight product to acc and the input values to sum - confirm against the macro.
    MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);
    MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);
    MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);

    MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);
    MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);
    MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);

    MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);
    MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);
    MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);

#if defined(HAS_BIAS)
    Vector biases      = CONVERT_TO_VECTOR_STRUCT(biases);
    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
    acc += bias_values;
#endif // defined(HAS_BIAS)

    // Quantization offset corrections; each term is compiled in only when its offset is non-zero
#if WEIGHTS_OFFSET != 0
    acc += WEIGHTS_OFFSET * sum;
#endif /* WEIGHTS_OFFSET != 0 */

#if INPUT_OFFSET != 0
    acc += INPUT_OFFSET * sum_we;
#endif /* INPUT_OFFSET != 0 */

#if K_OFFSET != 0
    acc += (VEC_INT)K_OFFSET;
#endif /* K_OFFSET != 0 */

    // Requantize the accumulator
#if defined(REAL_MULTIPLIER)

    // Floating-point path: scale by REAL_MULTIPLIER and round
    acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

#else // defined(REAL_MULTIPLIER)

#if defined(PER_CHANNEL_QUANTIZATION)
    // Per-channel multipliers and shifts are read from the vectors passed as kernel arguments
    Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);
    Vector output_shifts      = CONVERT_TO_VECTOR_STRUCT(output_shifts);
    VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
    VEC_INT output_shift      = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);

    // The shift sign is only known at run time, so both variants are evaluated and the
    // result is picked lane by lane: non-negative shift -> "less than one" variant.
    VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, output_multiplier, output_shift, VEC_SIZE);
    VEC_INT res_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
    acc                   = select(res_shift_lt0, res_shift_gt0, output_shift >= 0);
#else // defined(PER_CHANNEL_QUANTIZATION)
    // Uniform quantization: the variant is chosen at compile time from the sign of OUTPUT_SHIFT
#if OUTPUT_SHIFT < 0
    acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
#else  // OUTPUT_SHIFT < 0
    acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // OUTPUT_SHIFT < 0
#endif // defined(PER_CHANNEL_QUANTIZATION)

#endif // defined(REAL_MULTIPLIER)

    acc += (VEC_INT)OUTPUT_OFFSET;

    // Saturating conversion back to the tensor data type
    VEC_TYPE(VEC_SIZE)
    res = CONVERT_SAT(acc, VEC_TYPE(VEC_SIZE));

#if defined(DST_DEPTH)
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
#else  /* defined(DST_DEPTH) */
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
#endif /* defined(DST_DEPTH) */

    // Apply the activation and store the VEC_SIZE output channels
    VSTORE(VEC_SIZE)
    (ACTIVATION_FUNC(res), 0, (__global DATA_TYPE *)(dst_addr));
}

1015

#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

1016

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1017

#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1018

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1019

*

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1020

* @note This kernel assumes VEC_SIZE is 4.

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1021

* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1022

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)

1023

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

1024

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (e.g. -DNUM_ROWS_PROCESSED=2)

1025

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (e.g. -DNUM_PLANES_PROCESSED=2)

1026

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1027

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

1028

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

1029

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1030

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1031

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1032

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1033

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1034

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1035

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1036

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1037

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1038

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1039

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1040

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1041

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1042

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1043

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1044

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1045

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1046

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1047

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1048

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

1049

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1050

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1051

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1052

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1053

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1054

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1055

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

1056

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

1057

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

1058

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

1059

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

1060

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

1061

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

1062

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

1063

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

1064

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1065

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1066

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1067

* @param[in] max_offset Max offset for the input tensor

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1068

*/

1069

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1070

__kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1071

TENSOR4D_DECLARATION(src),

1072

TENSOR4D_DECLARATION(dst),

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1073

IMAGE_DECLARATION(weights),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1074

VECTOR_DECLARATION(output_multipliers),

1075

VECTOR_DECLARATION(output_shifts),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1076

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1077

VECTOR_DECLARATION(biases),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1078

#endif /* defined(HAS_BIAS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1079

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1080

{

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1081

int x = get_global_id(0);

1082

int y = get_global_id(1);

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1083

#if defined(DST_DEPTH)

1084

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1085

int b = get_global_id(2) / (int)DST_DEPTH; // batch

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1086

#else // defined(DST_DEPTH)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1087

int z = get_global_id(2); // spatial coordinate y

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1088

#endif // defined(DST_DEPTH)

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1089

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1090

__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1091

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1092

#if defined(DST_DEPTH)

1093

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;

1094

#else /* defined(DST_DEPTH) */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1095

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1096

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1097

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1098

int z_coord = 0;

1099

int4 offset = 0;

1100

int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1101

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1102

// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1

1103

y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);

1104

y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);

1105

y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);

1106

y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

1107

1108

int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

1109

1110

// We compute 4x2x2 [C,W,H] elements

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1111

VEC_INT acc0 = 0, sum0 = 0;

1112

VEC_INT acc1 = 0, sum1 = 0;

1113

VEC_INT acc2 = 0, sum2 = 0;

1114

VEC_INT acc3 = 0, sum3 = 0;

1115

1116

// Load weights

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1117

VEC_DATA_TYPE(WEIGHTS_TYPE, 16)

1118

w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));

1119

VEC_DATA_TYPE(WEIGHTS_TYPE, 16)

1120

w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));

1121

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1122

w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1123

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1124

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1125

w0 = w0_tmp.s0123;

1126

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1127

w1 = w0_tmp.s4567;

1128

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1129

w2 = w0_tmp.s89AB;

1130

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1131

w3 = w0_tmp.sCDEF;

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1132

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1133

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1134

w4 = w1_tmp.s0123;

1135

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1136

w5 = w1_tmp.s4567;

1137

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1138

w6 = w1_tmp.s89AB;

1139

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1140

w7 = w1_tmp.sCDEF;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1141

1142

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1143

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

1144

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

1145

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1146

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1151

// z_coord is casted to unsigned int in order to use just a min() operation

1152

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1153

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1154

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1155

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1156

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1157

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1158

VEC_TYPE(VEC_SIZE)

1159

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1160

VEC_TYPE(VEC_SIZE)

1161

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1162

VEC_TYPE(VEC_SIZE)

1163

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1164

VEC_TYPE(VEC_SIZE)

1165

values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1166

1167

// z == 1

1168

// z_coord can be only negative for z = 0 so we do not need to clamp it

1169

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1170

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;

1171

offset = y_offset + (int4)(z_coord * src_stride_z);

1172

VEC_TYPE(VEC_SIZE)

1173

values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1174

VEC_TYPE(VEC_SIZE)

1175

values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1176

VEC_TYPE(VEC_SIZE)

1177

values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1178

VEC_TYPE(VEC_SIZE)

1179

values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1180

1181

// z == 2

1182

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1183

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1184

offset += (int4)src_stride_z;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1185

offset = min(offset, (int4)max_offset);

1186

VEC_TYPE(VEC_SIZE)

1187

values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1188

VEC_TYPE(VEC_SIZE)

1189

values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1190

VEC_TYPE(VEC_SIZE)

1191

values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1192

VEC_TYPE(VEC_SIZE)

1193

values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1194

1195

// z == 3

1196

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1197

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1198

offset += (int4)(src_stride_z);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1199

offset = min(offset, (int4)max_offset);

1200

VEC_TYPE(VEC_SIZE)

1201

values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1202

VEC_TYPE(VEC_SIZE)

1203

values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1204

VEC_TYPE(VEC_SIZE)

1205

values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1206

VEC_TYPE(VEC_SIZE)

1207

values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1208

1209

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);

1210

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);

1211

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);

1212

MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);

1213

MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);

1214

MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);

1215

1216

MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);

1217

MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);

1218

MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);

1219

MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);

1220

MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);

1221

MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);

1222

1223

MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);

1224

MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);

1225

MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);

1226

MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);

1227

MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);

1228

MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);

1229

1230

MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);

1231

MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);

1232

MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);

1233

MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);

1234

MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);

1235

MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);

1236

1237

MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);

1238

MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);

1239

MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);

1240

MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);

1241

MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);

1242

MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);

1243

1244

MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);

1245

MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);

1246

MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);

1247

MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);

1248

MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);

1249

MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);

1250

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1251

#if defined(HAS_BIAS)

1252

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1253

1254

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1255

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

acc0 += bias_values;

acc1 += bias_values;

acc2 += bias_values;

acc3 += bias_values;

#endif /* defined(HAS_BIAS) */

1261

1262

#if WEIGHTS_OFFSET != 0

1263

acc0 += WEIGHTS_OFFSET * sum0;

1264

acc1 += WEIGHTS_OFFSET * sum1;

1265

acc2 += WEIGHTS_OFFSET * sum2;

1266

acc3 += WEIGHTS_OFFSET * sum3;

1267

#endif /* WEIGHTS_OFFSET != 0 */

1268

1269

#if INPUT_OFFSET != 0

1270

VEC_INT offs = INPUT_OFFSET * sum_we;

acc0 += offs;

acc1 += offs;

acc2 += offs;

acc3 += offs;

#endif /* INPUT_OFFSET != 0 */

1277

1278

#if K_OFFSET != 0

1279

acc0 += (VEC_INT)K_OFFSET;

1280

acc1 += (VEC_INT)K_OFFSET;

1281

acc2 += (VEC_INT)K_OFFSET;

1282

acc3 += (VEC_INT)K_OFFSET;

1283

#endif /* K_OFFSET != 0 */

1284

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1285

#if defined(REAL_MULTIPLIER)

1286

1287

acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1288

acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1289

acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1290

acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1291

1292

#else // defined(REAL_MULTIPLIER)

1293

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1294

#if defined(PER_CHANNEL_QUANTIZATION)

1295

Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);

1296

Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts);

1297

VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);

1298

VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1299

Sang-Hoon Park

4d79d4b

2020-01-30 14:38:37 +0000

[diff] [blame]

1300

VEC_INT res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);

1301

VEC_INT res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);

1302

VEC_INT res2_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);

1303

VEC_INT res3_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);

1304

VEC_INT res0_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);

1305

VEC_INT res1_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);

1306

VEC_INT res2_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);

1307

VEC_INT res3_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1308

acc0 = select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0);

1309

acc1 = select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0);

1310

acc2 = select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0);

1311

acc3 = select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0);

1312

#else // defined(PER_CHANNEL_QUANTIZATION)

1313

#if OUTPUT_SHIFT < 0

Michele Di Giorgio

b598afa

2019-12-20 16:22:20 +0000

[diff] [blame]

1314

acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);

1315

acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);

1316

acc2 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);

1317

acc3 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1318

#else // OUTPUT_SHIFT < 0

Michele Di Giorgio

b598afa

2019-12-20 16:22:20 +0000

[diff] [blame]

1319

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1320

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1321

acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1322

acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1323

#endif // OUTPUT_SHIFT < 0

1324

#endif // defined(PER_CHANNEL_QUANTIZATION)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1325

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1326

#endif // defined(REAL_MULTIPLIER)

1327

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1328

acc0 += (VEC_INT)OUTPUT_OFFSET;

1329

acc1 += (VEC_INT)OUTPUT_OFFSET;

1330

acc2 += (VEC_INT)OUTPUT_OFFSET;

1331

acc3 += (VEC_INT)OUTPUT_OFFSET;

1332

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1333

VEC_TYPE(VEC_SIZE)

1334

res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE));

1335

VEC_TYPE(VEC_SIZE)

1336

res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE));

1337

VEC_TYPE(VEC_SIZE)

1338

res2 = CONVERT_SAT(acc2, VEC_TYPE(VEC_SIZE));

1339

VEC_TYPE(VEC_SIZE)

1340

res3 = CONVERT_SAT(acc3, VEC_TYPE(VEC_SIZE));

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1341

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1342

#if defined(DST_DEPTH)

1343

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;

1344

#else /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1345

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1346

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1347

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1348

VSTORE(VEC_SIZE)

Georgios Pinitas

60e9825

2018-10-22 16:17:20 +0100

[diff] [blame]

1349

(ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1350

VSTORE(VEC_SIZE)

Georgios Pinitas

60e9825

2018-10-22 16:17:20 +0100

[diff] [blame]

1351

(ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1352

1353

#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1354

if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)

1355

#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1356

{

1357

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1358

(ACTIVATION_FUNC(res2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1359

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1360

(ACTIVATION_FUNC(res3), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1361

}

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1362

}

1363

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1364

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4

1365

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1366

*

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1367

* @note Per-channel quantization is not supported by this kernel.

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1368

* @note This kernel assumes VEC_SIZE is 4.

1369

* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1370

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)

1371

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

1372

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

1373

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

1374

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1375

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1376

* @note If REAL_MULTIPLIER is passed at compile time (e.g. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.

1377

* If not, the quantization will be performed using a fixed point multiplication

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1378

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

1379

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1380

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1381

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1382

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1383

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1384

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1385

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1386

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1387

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1388

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1389

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1390

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1391

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1392

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1393

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1394

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1395

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1396

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1397

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1398

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1399

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

1400

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1401

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1402

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1403

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1404

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1405

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

1406

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

1407

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

1408

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

1409

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

1410

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

1411

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

1412

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

1413

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

1414

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1415

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1416

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1417

* @param[in] max_offset The maximum allowed offset for the input tensor

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1418

*/

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1419

__kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1420

TENSOR4D_DECLARATION(src),

1421

TENSOR4D_DECLARATION(dst),

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1422

IMAGE_DECLARATION(weights),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1423

VECTOR_DECLARATION(output_multipliers),

1424

VECTOR_DECLARATION(output_shifts),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1425

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1426

VECTOR_DECLARATION(biases),

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1427

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1428

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1429

{

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1430

int x = get_global_id(0);

1431

int y = get_global_id(1);

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1432

#if defined(DST_DEPTH)

1433

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1434

int b = get_global_id(2) / (int)DST_DEPTH; // batch

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1435

#else // defined(DST_DEPTH)

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1436

int z = get_global_id(2); // spatial coordinate y

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1437

#endif // defined(DST_DEPTH)

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1438

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1439

__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1440

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1441

#if defined(DST_DEPTH)

1442

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;

1443

#else /* defined(DST_DEPTH) */

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1444

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1445

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1446

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1447

int z_coord = 0;

1448

int4 offset = 0;

1449

int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1450

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1451

// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1

1452

y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);

1453

y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);

1454

y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);

1455

y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

1456

1457

int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

1458

1459

// We compute 4x2x1 [C,W,H] elements

VEC_INT acc0 = 0;

VEC_INT acc1 = 0;

VEC_INT sum0 = 0;

VEC_INT sum1 = 0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1464

1465

// Load weights

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1466

VEC_TYPE(16)

1467

w0 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));

1468

VEC_TYPE(16)

1469

w1 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));

1470

VEC_TYPE(4)

1471

w2 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 32));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1472

1473

#if INPUT_OFFSET != 0

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1474

// Initialize the final result with the weights reduction multiplied by INPUT_OFFSET

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1475

DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1476

DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1477

DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1478

DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1479

1480

// Multiply the weights reduction with INPUT_OFFSET

1481

acc0 = INPUT_OFFSET * acc0;

1482

1483

acc1 = acc0;

1484

#endif // INPUT_OFFSET != 0

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1489

// z_coord is cast to unsigned int in order to use just a min() operation

1490

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1491

z_coord = z - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1492

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1493

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1494

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1495

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1496

VEC_TYPE(VEC_SIZE)

1497

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1498

VEC_TYPE(VEC_SIZE)

1499

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1500

VEC_TYPE(VEC_SIZE)

1501

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1502

VEC_TYPE(VEC_SIZE)

1503

values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1504

1505

// z == 1

1506

// z_coord can be only negative for z = 0 so we do not need to clamp it

1507

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1508

z_coord = z - (int)CONV_PAD_TOP + 1;

1509

offset = y_offset + (int4)(z_coord * src_stride_z);

1510

VEC_TYPE(VEC_SIZE)

1511

values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1512

VEC_TYPE(VEC_SIZE)

1513

values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1514

VEC_TYPE(VEC_SIZE)

1515

values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1516

VEC_TYPE(VEC_SIZE)

1517

values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1518

1519

// z == 2

1520

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1521

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1522

offset += (int4)src_stride_z;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1523

offset = min(offset, (int4)max_offset);

1524

VEC_TYPE(VEC_SIZE)

1525

values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1526

VEC_TYPE(VEC_SIZE)

1527

values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1528

VEC_TYPE(VEC_SIZE)

1529

values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1530

VEC_TYPE(VEC_SIZE)

1531

values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1532

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1533

DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);

1534

DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);

1535

DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);

1536

DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1537

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1538

DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);

1539

DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1540

DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);

1541

DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1542

1543

DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);

1544

DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);

1545

DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);

1546

DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);

1547

1548

DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);

1549

DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1550

DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);

1551

DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1552

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1553

#if defined(HAS_BIAS)

1554

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1555

1556

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1557

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1558

acc0 += bias_values;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1559

acc1 += bias_values;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1560

1561

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1562

1563

#if WEIGHTS_OFFSET != 0

1564

acc0 += WEIGHTS_OFFSET * sum0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1565

acc1 += WEIGHTS_OFFSET * sum1;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1566

#endif // WEIGHTS_OFFSET != 0

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1567

1568

#if K_OFFSET != 0

1569

acc0 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1570

acc1 += (VEC_INT)K_OFFSET;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1571

1572

#endif // K_OFFSET != 0

1573

1574

#if defined(REAL_MULTIPLIER)

1575

1576

acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1577

acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1578

1579

#else // defined(REAL_MULTIPLIER)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1580

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1581

#if OUTPUT_SHIFT < 0

1582

acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);

1583

acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);

1584

#else // OUTPUT_SHIFT < 0

1585

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1586

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

1587

#endif // OUTPUT_SHIFT < 0

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1588

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1589

#endif // defined(REAL_MULTIPLIER)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1590

acc0 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1591

acc1 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1592

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1593

VEC_TYPE(VEC_SIZE)

1594

res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE));

1595

VEC_TYPE(VEC_SIZE)

1596

res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1597

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1598

#if defined(DST_DEPTH)

1599

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;

1600

#else /* defined(DST_DEPTH) */

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1601

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1602

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1603

1604

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1605

(ACTIVATION_FUNC(res0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1606

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1607

(ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1608

}

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1609

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1610

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1611

#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1612

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1613

#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

1614

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1615

#endif // defined(WEIGHTS_PROMOTED_TYPE)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1616

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1617

#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1618

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1619

#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1620

/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped

1621

*

1622

* @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)

1623

* @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)

1624

* @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)

1625

* @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)

1626

* @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)

1627

* @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)

1628

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1629

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

1630

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)

1631

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1632

* @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1633

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

1634

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

1635

*

Michele Di Giorgio

4cd4cde

2020-01-06 14:07:44 +0000

[diff] [blame]

1636

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1637

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1638

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1639

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1640

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1641

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1642

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1643

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1644

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1645

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1646

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1647

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1648

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1649

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1650

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1651

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1652

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1653

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1654

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1655

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Michele Di Giorgio

4cd4cde

2020-01-06 14:07:44 +0000

[diff] [blame]

1656

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1657

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1658

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1659

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1660

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1661

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1662

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

1663

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1664

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

1665

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

1666

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

1667

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

1668

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

1669

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

1670

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

1671

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

1672

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

1673

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1674

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1675

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1676

*/

1677

// Quantized depthwise-convolution kernel: for one output position, each work-item
// accumulates (input + INPUT_OFFSET) * (weights + WEIGHTS_OFFSET) over a
// KERNEL_WIDTH x KERNEL_HEIGHT window for N0 consecutive channels, optionally adds
// a bias, requantizes, adds OUTPUT_OFFSET, saturates, applies the activation and stores.
// The biases vector argument only exists when HAS_BIAS is defined.
__kernel void dwc_MxN_native_quantized8_nhwc(

1678

TENSOR4D_DECLARATION(src),

1679

TENSOR4D_DECLARATION(dst),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1680

TENSOR3D_DECLARATION(weights),

1681

VECTOR_DECLARATION(output_multipliers),

1682

VECTOR_DECLARATION(output_shifts)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1683

#if defined(HAS_BIAS)

Michele Di Giorgio

1dce310

2019-10-22 10:29:03 +0100

[diff] [blame]

1684

,

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1685

VECTOR_DECLARATION(biases)

1686

#endif // defined(HAS_BIAS)

1687

)

1688

{

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1689

// First channel handled by this work-item: the start is pulled back by
// (N0 - VEC_SIZE_LEFTOVER) % N0 elements and clamped to 0, so work-item 0 starts at channel 0.
int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0);

1690

int y = get_global_id(1); // spatial coordinate x

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1691

// When DST_DEPTH is defined, global id 2 packs the spatial y coordinate (remainder)
// and the batch index (quotient).
#if defined(DST_DEPTH)

1692

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1693

int b = get_global_id(2) / (int)DST_DEPTH; // batch

1694

#else // defined(DST_DEPTH)

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1695

int z = get_global_id(2); // spatial coordinate y

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1696

#endif // defined(DST_DEPTH)

1697

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1698

// Source base address at channel x_offs; the spatial offset is added per kernel tap inside the loops below.
__global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1699

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1700

// Destination address for output position (y, z); the channel offset is scaled by DEPTH_MULTIPLIER.
__global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1701

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1702

// Weights base address; the channel offset is scaled by DEPTH_MULTIPLIER like the destination.
__global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1703

1704

#if defined(HAS_BIAS)

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1705

// Bias address, indexed per output channel (int elements).
__global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1706

#endif // defined(HAS_BIAS)

1707

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1708

#if defined(PER_CHANNEL_QUANTIZATION)

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1709

// Per-channel multiplier/shift addresses, indexed per output channel (int elements).
__global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;

1710

__global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1711

#endif // defined(PER_CHANNEL_QUANTIZATION)

1712

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1713

// Move source and destination to the batch selected above.
#if defined(DST_DEPTH)

1714

s_addr += b * src_stride_w;

1715

d_addr += b * dst_stride_w;

1716

#endif // defined(DST_DEPTH)

1717

1718

// With DEPTH_MULTIPLIER > 1 the whole computation is repeated once per multiplier step;
// the loop's closing brace sits in the matching #if block at the end of the kernel.
#if DEPTH_MULTIPLIER > 1

1719

for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)

1720

{

1721

#endif // DEPTH_MULTIPLIER > 1

1722

// Each work-item computes N0x1x1 elements

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1723

VEC_INT res = 0;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1724

1725

// Input coordinates of the first kernel tap; they can be negative because the padding is subtracted.
int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;

1726

int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;

1727

1728

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

1729

{

1730

// Rows outside [0, SRC_DIM2) are skipped and contribute nothing to the accumulator.
if(y_coord >= 0 && y_coord < SRC_DIM2)

1731

{

1732

int x_coord_tmp = x_coord;

1733

1734

for(int xk = 0; xk < KERNEL_WIDTH; ++xk)

1735

{

1736

// Columns outside [0, SRC_DIM1) are skipped in the same way.
if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)

1737

{

1738

// Byte offsets of the current tap in the source and weights tensors.
int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;

1739

int w_offset = xk * weights_stride_y + yk * weights_stride_z;

1740

1741

// Load input and weights values

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1742

VEC_INT i = CONVERT(VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset)), VEC_INT);

1743

VEC_INT w = CONVERT(VLOAD(N0)(0, (__global WEIGHTS_TYPE *)(w_addr + w_offset)), VEC_INT);

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1744

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1745

// Add the quantization offsets to both operands before multiplying, then accumulate in VEC_INT.
res += (i + (VEC_INT)INPUT_OFFSET) * (w + (VEC_INT)WEIGHTS_OFFSET);

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1746

}

1747

// Advance to the next tap column; the step is DILATION_X even when the tap was skipped.
x_coord_tmp += DILATION_X;

1748

}

1749

}

1750

// Advance to the next tap row; the step is DILATION_Y even when the row was skipped.
y_coord += DILATION_Y;

1751

}

1752

1753

#if defined(HAS_BIAS)

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1754

VEC_INT bias = VLOAD(N0)(0, (__global int *)(b_addr));

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1755

res += bias;

1756

#endif // defined(HAS_BIAS)

1757

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1758

// Requantization. Per-channel: both multiplier variants are evaluated and select() keeps,
// lane by lane, res_shift_gt0 where output_shift >= 0 and res_shift_lt0 elsewhere.
// Otherwise the variant is chosen at compile time from the sign of OUTPUT_SHIFT.
#if defined(PER_CHANNEL_QUANTIZATION)

Michele Di Giorgio

2019-10-23 10:53:10 +0100

[diff] [blame]

1759

VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));

1760

VEC_INT output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr));

1761

1762

VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0);

1763

VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);

1764

res = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0);

1765

#else // defined(PER_CHANNEL_QUANTIZATION)

1766

#if OUTPUT_SHIFT < 0

1767

res = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);

1768

#else // OUTPUT_SHIFT < 0

1769

res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);

1770

#endif // OUTPUT_SHIFT < 0

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1771

#endif // defined(PER_CHANNEL_QUANTIZATION)

1772

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1773

res += (VEC_INT)OUTPUT_OFFSET;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1774

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1775

// Saturating conversion of the accumulator into res0, followed by the activation.
VEC_TYPE(VEC_SIZE)

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1776

res0 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE));

1777

res0 = ACTIVATION_FUNC(res0);

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1778

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1779

// NOTE(review): the macro is given the basename `res`; presumably it appends the row index and
// therefore stores `res0` (the converted vector), not the accumulator - confirm against the macro definition.
// The last argument is true only for work-item 0 when VEC_SIZE_LEFTOVER != 0.
STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1780

1781

// Step to the next depth-multiplier output: weights, destination, quantization and bias pointers
// each advance by one element; s_addr is left unchanged.
#if DEPTH_MULTIPLIER > 1

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1782

w_addr += sizeof(WEIGHTS_TYPE);

1783

d_addr += sizeof(DATA_TYPE);

1784

#if defined(PER_CHANNEL_QUANTIZATION)

1785

out_mul_addr += sizeof(int);

1786

out_shift_addr += sizeof(int);

1787

#endif // defined(PER_CHANNEL_QUANTIZATION)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1788

#if defined(HAS_BIAS)

1789

b_addr += sizeof(int);

1790

#endif // defined(HAS_BIAS)

1791

}

1792

#endif // DEPTH_MULTIPLIER > 1

1793

}

Giorgio Arena

2020-10-22 14:29:50 +0100

[diff] [blame]

1794

#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)

Michele Di Giorgio