Blame - src/core/CL/cl_kernels/direct_convolution3x3.cl - ml/ComputeLibrary

2017-08-16 18:38:32 +0100

[diff] [blame]

43

#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

44

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

45

#if STRIDE_X == 1

46

#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)

47

#elif STRIDE_X == 2 /* STRIDE_X == 1 */

48

#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

49

#else /* STRIDE_X not equals 1 or 2 */

50

#error "STRIDE_X larger than 2 is not supported"

51

#endif /* STRIDE_X == 2 */

52

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

53

#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \

54

({ \

steniu01

db00668

2017-08-09 16:26:22 +0100

[diff] [blame]

55

VEC_DATA_TYPE(DATA_TYPE, 3) \

56

weights_values0 = vload3(0, weights_row_ptr); \

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

57

VEC_DATA_TYPE(DATA_TYPE, 8) \

58

src0 = vload8(0, src_row_ptr); \

59

VEC_DATA_TYPE(DATA_TYPE, 2) \

60

src1 = vload2(0, src_row_ptr + 8); \

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

61

\

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

62

acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \

63

acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \

64

acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

65

})

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

66

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

67

#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \

68

({ \

steniu01

db00668

2017-08-09 16:26:22 +0100

[diff] [blame]

69

VEC_DATA_TYPE(DATA_TYPE, 3) \

70

weights_values0 = vload3(0, weights_row_ptr); \

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

71

VEC_DATA_TYPE(DATA_TYPE, 16) \

72

src0 = vload16(0, src_row_ptr); \

73

DATA_TYPE src1 = *(src_row_ptr + 16); \

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

74

\

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

75

acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \

76

acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \

77

acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

78

})

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

79

80

/** This kernel performs a direct convolution to convolve the low three dimensions.

81

*

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

82

* @note This OpenCL kernel works with stride_x = 1 and 2

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

83

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

84

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

85

* @note If biases are used then -DHAS_BIAS has to be passed at compile time

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

86

*

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

87

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

88

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

89

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

90

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

91

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

92

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

93

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

94

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

95

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

96

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

97

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

98

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

99

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

100

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

101

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

102

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

103

* @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr

104

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

105

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

106

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

107

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

108

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

109

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

110

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

111

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

112

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

113

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

114

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

115

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

116

*/

117

__kernel void direct_convolution3x3(

118

TENSOR3D_DECLARATION(src),

119

TENSOR3D_DECLARATION(dst),

120

TENSOR3D_DECLARATION(weights),

121

#ifdef HAS_BIAS

122

VECTOR_DECLARATION(biases),

123

#endif /* defined(HAS_BIAS) */

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

124

unsigned int weights_stride_w)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

125

{

126

Image src = CONVERT_TO_IMAGE_STRUCT(src);

127

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

128

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

129

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

130

VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

131

pixels0 = 0;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

132

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

133

__global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);

134

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

135

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

136

const int kernel_index = get_global_id(2);

137

weights_addr += kernel_index * weights_stride_w;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

138

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

139

for(int d = 0; d < WEIGHTS_DEPTH; ++d)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

140

{

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

141

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));

142

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));

143

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

144

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

145

src_addr += src_stride_z;

146

weights_addr += weights_stride_z;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

147

}

148

149

#ifdef HAS_BIAS

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

150

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

151

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

152

pixels0 = ADD_OP(pixels0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

153

#endif /* defined(HAS_BIAS) */

154

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

155

vstore8(CONVERT_SAT(pixels0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

156

}

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

157

#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

158

159

#if defined(WEIGHTS_DEPTH)

160

161

#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \

162

({ \

163

acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \

164

acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \

165

acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \

166

acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \

167

acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \

168

acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \

169

acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \

170

acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \

171

acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \

172

acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \

173

acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \

174

acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \

175

})

176

177

/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32

178

*

179

* @note This OpenCL kernel works only with stride_x and stride_y equal to 1

180

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

181

* @note In case biases, -DHAS_BIAS must to be passed at compile

182

*

183

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

184

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

185

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

186

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

187

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

188

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

189

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

190

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

191

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

192

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

193

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

194

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

195

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

196

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

197

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

198

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

199

* @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr

200

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

201

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

202

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

203

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

204

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

205

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

206

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

207

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

208

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

209

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

210

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

211

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

212

*/

213

__kernel void direct_convolution3x3_f32_bifrost(

214

TENSOR3D_DECLARATION(src),

215

TENSOR3D_DECLARATION(dst),

216

TENSOR3D_DECLARATION(weights),

217

#ifdef HAS_BIAS

218

VECTOR_DECLARATION(biases),

219

#endif /* defined(HAS_BIAS) */

220

unsigned int weights_stride_w)

221

{

222

// Get the kernel index

223

const int kernel_index = get_global_id(2);

224

225

Image src = CONVERT_TO_IMAGE_STRUCT(src);

226

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

float4 pixels0 = 0;

float4 pixels1 = 0;

float4 pixels2 = 0;

__global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);

233

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

234

235

// Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor

236

237

for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)

238

{

239

// Load the weights

240

float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

241

float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

242

float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

float4 src0;

float2 src1;

// Load values from row0 of input tensor

247

src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));

248

src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);

249

250

CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row0);

251

252

// Load values from row1 of input tensor

253

src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));

254

src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);

255

256

// Accumulate

257

CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row1);

258

CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row0);

259

260

// Load values from row2 of input tensor

261

src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));

262

src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);

263

264

// Accumulate

265

CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row2);

266

CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row1);

267

CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row0);

268

269

// Load values from row3 of input tensor

270

src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));

271

src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);

272

273

// Accumulate

274

CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row2);

275

CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row1);

276

277

// Row4

278

src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));

279

src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);

280

281

// Accumulate

282

CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row2);

283

284

src_addr += src_stride_z;

285

weights_addr += weights_stride_z;

}

#ifdef HAS_BIAS

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

290

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame^]

291

float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

292

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame^]

293

pixels0 += (float4)bias;

294

pixels1 += (float4)bias;

295

pixels2 += (float4)bias;

Gian Marco Iodice