Blame - src/core/CL/cl_kernels/direct_convolution5x5.cl - ml/ComputeLibrary

2017-08-09 16:26:22 +0100

[diff] [blame]

1

/*

Michele Di Giorgio

d9eaf61

2020-07-08 11:12:57 +0100

[diff] [blame]

2

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

#undef CONVERT_SAT

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

28

#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

29

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

30

#if STRIDE_X == 1

31

#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)

32

#elif STRIDE_X == 2 /* STRIDE_X == 1 */

33

#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)

34

#else /* STRIDE_X not equals 1 or 2 */

35

#error "STRIDE_X larger than 2 is not supported"

36

#endif /* STRIDE_X == 2 */

37

38

#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \

39

({ \

40

VEC_DATA_TYPE(DATA_TYPE, 4) \

41

weights_values0 = vload4(0, weights_row_ptr); \

42

DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \

43

VEC_DATA_TYPE(DATA_TYPE, 8) \

44

src0 = vload8(0, src_row_ptr); \

45

VEC_DATA_TYPE(DATA_TYPE, 4) \

46

src1 = vload4(0, src_row_ptr + 8); \

47

\

48

acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \

49

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \

50

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \

51

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \

52

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \

53

})

54

55

#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \

56

({ \

57

VEC_DATA_TYPE(DATA_TYPE, 4) \

58

weights_values0 = vload4(0, weights_row_ptr); \

59

DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \

60

VEC_DATA_TYPE(DATA_TYPE, 16) \

61

src0 = vload16(0, src_row_ptr); \

62

VEC_DATA_TYPE(DATA_TYPE, 4) \

63

src1 = vload4(0, src_row_ptr + 16); \

64

acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \

65

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \

66

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \

67

\

68

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \

69

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \

70

})

71

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

72

#if defined(DATA_LAYOUT_NHWC)

73

74

#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))

75

76

#if STRIDE_X == 1

77

#define CONVOLUTION1x5_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x5_STRIDE1_NHWC(acc, row_ptr, weights_ptr)

78

#elif STRIDE_X == 2 /* STRIDE_X == 1 */

79

#define CONVOLUTION1x5_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x5_STRIDE2_NHWC(acc, row_ptr, weights_ptr)

80

#else /* STRIDE_X not equals 1 or 2 */

81

#error "STRIDE_X larger than 2 is not supported"

82

#endif /* STRIDE_X == 2 */

83

84

#define CONVOLUTION1x5_STRIDE1_NHWC(acc, row_ptr, weights_ptr) \

85

({ \

86

VEC_DATA_TYPE(DATA_TYPE, 8) \

87

src0 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \

88

PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \

89

PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \

90

PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \

91

PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE)); \

92

VEC_DATA_TYPE(DATA_TYPE, 4) \

93

src1 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \

94

PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \

95

PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE)); \

96

VEC_DATA_TYPE(DATA_TYPE, 4) \

97

weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \

98

PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE), \

99

PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE)); \

100

DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE); \

101

acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \

102

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \

103

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \

104

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \

105

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \

106

})

107

108

#define CONVOLUTION1x5_STRIDE2_NHWC(acc, row_ptr, weights_ptr) \

109

({ \

110

VEC_DATA_TYPE(DATA_TYPE, 16) \

111

src0 = (VEC_DATA_TYPE(DATA_TYPE, 16))( \

112

PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \

113

PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \

114

PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \

115

PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE), \

116

PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \

117

PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE), \

118

PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE), \

119

PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE)); \

120

VEC_DATA_TYPE(DATA_TYPE, 4) \

121

src1 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \

122

PTR_TO_VALUE(row_ptr + 16 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 17 * src_stride_y, DATA_TYPE), \

123

PTR_TO_VALUE(row_ptr + 18 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 19 * src_stride_y, DATA_TYPE)); \

124

VEC_DATA_TYPE(DATA_TYPE, 4) \

125

weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \

126

PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE), \

127

PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE)); \

128

DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE); \

129

acc += src0.s02468ACE * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \

130

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \

131

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \

132

\

133

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \

134

acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \

135

})

136

137

/** This kernel performs a direct convolution to convolve the low three dimensions in a tensor with the NHWC data layout

138

*

139

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

140

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

141

* @note If biases are used then -DHAS_BIAS has to be passed at compile time

142

*

143

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

144

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

145

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

146

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

147

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

148

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

149

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

150

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

151

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

152

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

153

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

154

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

155

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

156

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

157

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

158

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

159

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

160

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

161

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

162

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

163

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

164

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

165

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

166

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

167

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

168

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

169

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

170

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

171

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

172

*/

173

__kernel void direct_convolution5x5_nhwc(

174

TENSOR3D_DECLARATION(src),

175

TENSOR3D_DECLARATION(dst),

176

TENSOR3D_DECLARATION(weights),

177

#ifdef HAS_BIAS

178

VECTOR_DECLARATION(biases),

179

#endif /* defined(HAS_BIAS) */

180

unsigned int weights_stride_w)

181

{

182

Image src = CONVERT_TO_IMAGE_STRUCT(src);

183

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

184

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

185

186

VEC_DATA_TYPE(DATA_TYPE, 8)

187

values0 = 0;

188

189

const int id0 = get_global_id(0);

190

const int id1 = get_global_id(1);

191

const int id2 = get_global_id(2);

192

193

__global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);

194

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;

195

196

weights_addr += id0 * weights_stride_w;

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

197

Pablo Tello

d041a83

2018-10-03 17:11:09 +0100

[diff] [blame]

198

#if(PAD_TOP == 1)

199

const int coordy = id2 - PAD_TOP;

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

200

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

201

{

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

202

if(coordy < 0) // special case Z = -1 doesn't exists

203

{

204

//skip first row and load the two next ones

205

CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));

206

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

207

CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));

208

CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));

209

}

210

else if(coordy == (DST_HEIGHT - PAD_TOP - 1))

211

{

212

// special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the

213

// Z axis has no padding at all.

214

CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);

215

CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));

216

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

217

CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));

}

else

{

CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);

222

CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));

223

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

224

CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));

225

CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));

226

}

Pablo Tello

d041a83

2018-10-03 17:11:09 +0100

[diff] [blame]

227

src_addr += src_stride_x;

228

weights_addr += weights_stride_x;

229

}

230

#elif(PAD_TOP == 2)

231

const int coordy = id2 * STRIDE_Y;

232

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

233

{

234

if(coordy == 0) // special case Z = -2 doesn't exists

235

{

236

//skip first row and load the two next ones

237

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

238

CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));

239

CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));

240

}

241

else if(coordy == 1) // special case Z = -1 doesn't exists

242

{

243

//skip first row and load the two next ones

244

CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));

245

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

246

CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));

247

CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));

248

}

249

else if(coordy == (SRC_HEIGHT - 1))

250

{

251

// special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the

252

// Z axis has no padding at all.

253

CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);

254

CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));

255

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

256

}

257

else if(coordy == (SRC_HEIGHT - 2))

258

{

259

// special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the

260

// Z axis has no padding at all.

261

CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);

262

CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));

263

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

264

CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));

}

else

{

CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);

269

CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));

270

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

271

CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));

272

CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));

273

}

274

src_addr += src_stride_x;

275

weights_addr += weights_stride_x;

276

}

277

278

#else /* PAD_TOP == 2 */

279

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

280

{

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

281

CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);

282

CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));

283

CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));

284

CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));

285

CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

286

src_addr += src_stride_x;

287

weights_addr += weights_stride_x;

288

}

Pablo Tello

d041a83

2018-10-03 17:11:09 +0100

[diff] [blame]

289

#endif /* PAD_TOP == 1 */

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

290

291

#ifdef HAS_BIAS

292

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

293

values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0)));

294

#endif /* defined(HAS_BIAS) */

295

296

*((__global DATA_TYPE *)(dst.ptr + 0 * dst_stride_y)) = values0.s0;

297

*((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values0.s1;

298

*((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values0.s2;

299

*((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values0.s3;

300

*((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values0.s4;

301

*((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values0.s5;

302

*((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values0.s6;

303

*((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values0.s7;

304

}

305

306

#endif // defined(DATA_LAYOUT_NHWC)

307

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

308

/** This kernel performs a direct convolution to convolve the low three dimensions.

309

*

310

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

311

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

312

* @note If biases are used then -DHAS_BIAS has to be passed at compile time

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

313

*

314

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

315

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

316

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

317

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

318

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

319

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

320

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

321

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

322

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

323

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

324

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

325

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

326

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

327

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

328

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

329

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Joel Liang

f1f3ebd

2017-11-10 09:59:19 +0800

[diff] [blame]

330

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

331

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

332

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

333

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

334

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

335

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

336

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

337

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

338

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

339

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

340

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

341

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

342

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

343

*/

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

344

__kernel void direct_convolution5x5(

345

TENSOR3D_DECLARATION(src),

346

TENSOR3D_DECLARATION(dst),

347

TENSOR3D_DECLARATION(weights),

348

#ifdef HAS_BIAS

349

VECTOR_DECLARATION(biases),

350

#endif /* defined(HAS_BIAS) */

351

unsigned int weights_stride_w)

352

{

353

Image src = CONVERT_TO_IMAGE_STRUCT(src);

354

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

355

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

356

357

VEC_DATA_TYPE(DATA_TYPE, 8)

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

358

values0 = 0;

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

359

360

__global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);

361

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

362

363

const int kernel_index = get_global_id(2);

364

weights_addr += kernel_index * weights_stride_w;

365

Gian Marco Iodice

744b5ed

2017-10-06 15:44:27 +0100

[diff] [blame]

366

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

367

{

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

368

CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);

369

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));

370

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));

371

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));

372

CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

373

374

src_addr += src_stride_z;

375

weights_addr += weights_stride_z;

}

#ifdef HAS_BIAS

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

380

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

381

values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

382

#endif /* defined(HAS_BIAS) */

383

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

384

vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr);

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

385

}

386

#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

387

388

#if defined(WEIGHTS_DEPTH)

389

390

#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \

391

({ \

392

acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \

393

acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \

394

acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \

395

acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \

396

acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \

397

acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \

398

acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \

399

acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \

400

acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \

401

acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \

402

acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \

403

acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \

404

acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \

405

acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \

406

acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \

407

acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \

408

acc.s0 = mad(src0.s4, weights_row01, acc.s0); \

409

acc.s1 = mad(src0.s5, weights_row01, acc.s1); \

410

acc.s2 = mad(src0.s6, weights_row01, acc.s2); \

411

acc.s3 = mad(src0.s7, weights_row01, acc.s3); \

412

})

413

414

/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32

415

*

416

* @note This OpenCL kernel works only with stride_x and stride_y equal to 1

417

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

418

* @note If biases are used then -DHAS_BIAS has to be passed at compile time

419

*

420

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

421

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

422

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

423

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

424

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

425

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

426

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

427

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

428

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

429

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

430

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

431

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

432

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

433

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

434

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

435

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Joel Liang

f1f3ebd

2017-11-10 09:59:19 +0800

[diff] [blame]

436

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

437

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

438

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

439

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

440

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

441

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

442

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

443

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

444

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

445

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

446

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

447

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

448

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

449

*/

450

__kernel void direct_convolution5x5_f32_bifrost(

451

TENSOR3D_DECLARATION(src),

452

TENSOR3D_DECLARATION(dst),

453

TENSOR3D_DECLARATION(weights),

454

#ifdef HAS_BIAS

455

VECTOR_DECLARATION(biases),

456

#endif /* defined(HAS_BIAS) */

457

unsigned int weights_stride_w)

458

{

459

// Get the kernel index

460

const int kernel_index = get_global_id(2);

461

462

Image src = CONVERT_TO_IMAGE_STRUCT(src);

463

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

464

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

465

float4 values0 = 0.0f;

466

float4 values1 = 0.0f;

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

467

468

__global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);

469

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

470

471

// Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor

472

473

for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)

474

{

475

// Load the weights from row0 and row1

476

float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));

477

float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);

478

float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));

479

float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);

480

float8 src0;

481

482

// Load values from row0 of input tensor

483

src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));

484

485

// Accumulate

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

486

CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

487

488

// Load values from row1 of input tensor

489

src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));

490

491

// Accumulate

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

492

CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);

493

CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

494

495

// Load values from row2 of input tensor

496

src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));

497

498

// Load weights from row2

499

weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));

500

weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);

501

502

// Accumulate

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

503

CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);

504

CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

505

506

// Load values from row3 of input tensor

507

src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));

508

509

// Load weights from row3

510

weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));

511

weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);

512

513

// Accumulate

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

514

CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);

515

CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

516

517

// Load values from row4 of input tensor

518

src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));

519

520

// Load weights from row4

521

weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));

522

weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);

523

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

524

CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);

525

CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

526

527

// Load values from row5 of input tensor

528

src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));

529

530

// Accumulate

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

531

CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

532

533

src_addr += src_stride_z;

534

weights_addr += weights_stride_z;

}

#ifdef HAS_BIAS

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

539

540

float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));

541

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

542

values0 += bias;

543

values1 += bias;

Gian Marco Iodice

2017-08-16 18:38:32 +0100

[diff] [blame]

544

#endif /* defined(HAS_BIAS) */

545

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

546

vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));

547

vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));

Gian Marco Iodice