Blame - src/core/CL/cl_kernels/direct_convolution3x3.cl - ml/ComputeLibrary

2017-08-14 11:26:37 +0100

[diff] [blame]

39

#define ADD_OP(a, b) ((a) + (b))

40

#define MUL_OP(a, b) ((a) * (b))

41

#define CONVERT_SAT(a, b) ((a))

42

43

#endif /* FIXED_POINT_POSITION */

44

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

45

#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

46

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

47

/* Select the 1x3 row-convolution helper that matches the compile-time
 * horizontal stride. Only STRIDE_X values of 1 and 2 have an implementation;
 * any other value stops the build with the #error below.
 */
#if STRIDE_X == 1
#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
#elif STRIDE_X == 2 /* STRIDE_X == 1 */
#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
#else /* STRIDE_X not equals 1 or 2 */
#error "STRIDE_X larger than 2 is not supported"
#endif /* STRIDE_X == 2 */

54

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

55

/** Accumulate one 1x3 filter row into 8 horizontally adjacent outputs (stride 1).
 *
 * Loads 3 weights from @p weights_row_ptr and 10 consecutive elements from
 * @p src_row_ptr (a vload8 followed by a vload2 at element offset 8), then adds
 * the three tap products to @p acc through ADD_OP / MUL_OP.
 *
 * @note @p acc is evaluated several times and is assigned to, so it must be a
 *       side-effect-free lvalue.
 * @note The locals weights_values0, src0 and src1 are declared inside the
 *       statement expression; arguments must not use those names.
 *
 * @param[in,out] acc             8-wide accumulator updated in place
 * @param[in]     src_row_ptr     Pointer to the first input element of the row (10 elements are read)
 * @param[in]     weights_row_ptr Pointer to the 3 weights of the filter row
 */
#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)                                                                                  \
    ({                                                                                                                                             \
        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                                                                \
        weights_values0 = vload3(0, (weights_row_ptr));                                                                                            \
        VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                                \
        src0 = vload8(0, (src_row_ptr));                                                                                                           \
        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                                                                \
        src1 = vload2(0, (src_row_ptr) + 8);                                                                                                       \
                                                                                                                                                   \
        acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0));                                                          \
        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
    })

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

68

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

69

/** Accumulate one 1x3 filter row into 8 outputs spaced two input elements apart (stride 2).
 *
 * Loads 3 weights from @p weights_row_ptr and 17 consecutive elements from
 * @p src_row_ptr (a vload16 plus the scalar at element offset 16). The three
 * taps use the even lanes, the odd lanes, and the even lanes shifted by one
 * output (s2468, sACE and the extra scalar) respectively.
 *
 * @note @p acc is evaluated several times and is assigned to, so it must be a
 *       side-effect-free lvalue.
 * @note The locals weights_values0, src0 and src1 are declared inside the
 *       statement expression; arguments must not use those names.
 *
 * @param[in,out] acc             8-wide accumulator updated in place
 * @param[in]     src_row_ptr     Pointer to the first input element of the row (17 elements are read)
 * @param[in]     weights_row_ptr Pointer to the 3 weights of the filter row
 */
#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)                                                                               \
    ({                                                                                                                                          \
        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                                                             \
        weights_values0 = vload3(0, (weights_row_ptr));                                                                                         \
        VEC_DATA_TYPE(DATA_TYPE, 16)                                                                                                            \
        src0           = vload16(0, (src_row_ptr));                                                                                             \
        DATA_TYPE src1 = *((src_row_ptr) + 16);                                                                                                 \
                                                                                                                                                \
        acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0));                                                  \
        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1));      \
        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
    })

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

81

82

/** This kernel performs a direct convolution to convolve the low three dimensions.

83

*

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

84

* @note This OpenCL kernel works with stride_x = 1 and 2

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

85

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

86

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

87

* @note If biases are used then -DHAS_BIAS has to be passed at compile time

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

88

*

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

89

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

90

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

91

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

92

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

93

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

94

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

95

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

96

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

97

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

98

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

99

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

100

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

101

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

102

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

103

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

104

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

105

* @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr

106

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

107

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

108

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

109

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

110

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

111

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

112

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

113

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

114

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

115

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

116

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

117

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

118

*/

119

__kernel void direct_convolution3x3(

120

TENSOR3D_DECLARATION(src),

121

TENSOR3D_DECLARATION(dst),

122

TENSOR3D_DECLARATION(weights),

123

#ifdef HAS_BIAS

124

VECTOR_DECLARATION(biases),

125

#endif /* defined(HAS_BIAS) */

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

126

unsigned int weights_stride_w)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

127

{

128

Image src = CONVERT_TO_IMAGE_STRUCT(src);

129

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

130

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

131

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

132

VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

133

pixels0 = 0;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

134

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

135

__global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);

136

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

137

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

138

const int kernel_index = get_global_id(2);

139

weights_addr += kernel_index * weights_stride_w;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

140

Gian Marco Iodice

744b5ed

2017-10-06 15:44:27 +0100

[diff] [blame^]

141

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

142

{

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

143

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));

144

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));

145

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

146

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

147

src_addr += src_stride_z;

148

weights_addr += weights_stride_z;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

149

}

150

151

#ifdef HAS_BIAS

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

152

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

153

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

154

pixels0 = ADD_OP(pixels0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

155

#endif /* defined(HAS_BIAS) */

156

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

157

vstore8(CONVERT_SAT(pixels0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

158

}

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

159

#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

160

161

#if defined(WEIGHTS_DEPTH)

162

163

/** Accumulate one 1x3 filter row into 4 horizontally adjacent outputs using scalar mad() calls.
 *
 * Output lane i receives in[i] * w.s0 + in[i + 1] * w.s1 + in[i + 2] * w.s2,
 * where in[0..3] are the lanes of @p src0 and in[4..5] are the lanes of @p src1.
 *
 * @note Every argument is evaluated several times, so arguments must be free
 *       of side effects.
 *
 * @param[in,out] acc          4-lane accumulator (lanes s0..s3) updated in place
 * @param[in]     src0         First 4 input elements of the row (lanes s0..s3)
 * @param[in]     src1         Next 2 input elements of the row (lanes s0..s1)
 * @param[in]     weights_row0 The 3 weights of the filter row (lanes s0..s2)
 */
#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0)           \
    ({                                                                  \
        (acc).s0 = mad((src0).s0, (weights_row0).s0, (acc).s0);         \
        (acc).s1 = mad((src0).s1, (weights_row0).s0, (acc).s1);         \
        (acc).s2 = mad((src0).s2, (weights_row0).s0, (acc).s2);         \
        (acc).s3 = mad((src0).s3, (weights_row0).s0, (acc).s3);         \
        (acc).s0 = mad((src0).s1, (weights_row0).s1, (acc).s0);         \
        (acc).s1 = mad((src0).s2, (weights_row0).s1, (acc).s1);         \
        (acc).s2 = mad((src0).s3, (weights_row0).s1, (acc).s2);         \
        (acc).s3 = mad((src1).s0, (weights_row0).s1, (acc).s3);         \
        (acc).s0 = mad((src0).s2, (weights_row0).s2, (acc).s0);         \
        (acc).s1 = mad((src0).s3, (weights_row0).s2, (acc).s1);         \
        (acc).s2 = mad((src1).s0, (weights_row0).s2, (acc).s2);         \
        (acc).s3 = mad((src1).s1, (weights_row0).s2, (acc).s3);         \
    })

178

179

/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
 *
 * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
 * @note If biases are used then -DHAS_BIAS has to be passed at compile time
 *
 * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
 * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
 * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
 * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
 * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
 * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
 */

215

__kernel void direct_convolution3x3_f32_bifrost(

216

TENSOR3D_DECLARATION(src),

217

TENSOR3D_DECLARATION(dst),

218

TENSOR3D_DECLARATION(weights),

219

#ifdef HAS_BIAS

220

VECTOR_DECLARATION(biases),

221

#endif /* defined(HAS_BIAS) */

222

unsigned int weights_stride_w)

223

{

224

// Get the kernel index

225

const int kernel_index = get_global_id(2);

226

227

Image src = CONVERT_TO_IMAGE_STRUCT(src);

228

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

float4 pixels0 = 0;

float4 pixels1 = 0;

float4 pixels2 = 0;

__global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);

235

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

236

237

// Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor

238

239

for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)

240

{

241

// Load the weights

242

float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

243

float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

244

float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

float4 src0;

float2 src1;

// Load values from row0 of input tensor

249

src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));

250

src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);

251

252

CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row0);

253

254

// Load values from row1 of input tensor

255

src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));

256

src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);

257

258

// Accumulate

259

CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row1);

260

CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row0);

261

262

// Load values from row2 of input tensor

263

src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));

264

src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);

265

266

// Accumulate

267

CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row2);

268

CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row1);

269

CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row0);

270

271

// Load values from row3 of input tensor

272

src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));

273

src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);

274

275

// Accumulate

276

CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row2);

277

CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row1);

278

279

// Row4

280

src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));

281

src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);

282

283

// Accumulate

284

CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row2);

285

286

src_addr += src_stride_z;

287

weights_addr += weights_stride_z;

}

#ifdef HAS_BIAS

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

292

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

293

float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

294

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

295

pixels0 += (float4)bias;

296

pixels1 += (float4)bias;

297

pixels2 += (float4)bias;

Gian Marco Iodice