Blame - src/core/CL/cl_kernels/pooling_layer.cl - ml/ComputeLibrary

2017-09-04 18:44:23 +0100

[diff] [blame]

/*

*

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

26

#ifdef FIXED_POINT_POSITION

27

28

#include "fixed_point.h"

29

30

#if defined(POOL_AVG)

31

#define POOL_OP(x, y) add_sat(x, y)

32

#else /* POOL_AVG */

33

#define POOL_OP(x, y) (max((x), (y)))

34

#endif /* POOL_AVG */

35

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

36

#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), (y), DATA_TYPE, FIXED_POINT_POSITION)

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

37

/* Fixed-point division helper: shifts y left by FIXED_POINT_POSITION and forwards
 * both operands to DIV_OP1. Both arguments are parenthesised so that compound
 * expressions (e.g. `a | b`) are shifted/divided as a whole. */
#define DIV_OP(x, y) DIV_OP1((x), (y) << FIXED_POINT_POSITION)

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

38

#define SQRT_OP(x) DIV_OP1((1 << FIXED_POINT_POSITION), (INVSQRT_OP_EXPAND((x), DATA_TYPE, 1, FIXED_POINT_POSITION)))

39

40

#if defined(POOL_L2)

41

#define POW2_OP(x, vec_size) MUL_SAT_OP_EXPAND((x), (x), DATA_TYPE, vec_size, FIXED_POINT_POSITION)

42

#else /* defined(POOL_L2) */

43

#define POW2_OP(x, vec_size) (x)

44

#endif /* defined(POOL_L2) */

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

45

46

#else /* FIXED_POINT_POSITION */

47

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

48

#if defined(POOL_AVG) || defined(POOL_L2)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

49

#define POOL_OP(x, y) ((x) + (y))

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

50

#else /* defined(POOL_AVG) || defined(POOL_L2) */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

51

#define POOL_OP(x, y) (fmax((x), (y)))

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

52

#endif /* defined(POOL_AVG) || defined(POOL_L2) */

53

54

#if defined(POOL_L2)

55

#define POW2_OP(x, vec_size) ((x) * (x))

56

#else /* defined(POOL_L2) */

57

#define POW2_OP(x, vec_size) (x)

58

#endif /* defined(POOL_L2) */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

59

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

60

/* Floating-point division helper: multiplies x by the reciprocal of y.
 * Both arguments are parenthesised so that compound expressions such as
 * `a + b` are evaluated as a unit before the multiply/divide. */
#define DIV_OP(x, y) ((x) * (1.f / (y)))

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

61

#define SQRT_OP(x) sqrt((x))

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

62

63

#endif /* FIXED_POINT_POSITION */

64

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

65

#if STRIDE_X == 1

66

#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)

67

#elif STRIDE_X == 2 /* STRIDE_X == 1 */

68

#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)

69

#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */

70

#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)

71

#endif /* STRIDE_X == 3 */

72

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

73

/** Computes four horizontally adjacent 3x3 pooling results when STRIDE_X == 1.
 *
 * Six consecutive elements (d0..d5) are loaded from each of three rows (y offsets 0, 1, 2)
 * and passed through POW2_OP. Each row is then rearranged so that the three rows can be
 * combined lane-wise with POOL_OP and the columns reduced afterwards:
 *   res.s0 = POOL(d0, d1, d2), res.s1 = POOL(d1, d2, d3),
 *   res.s2 = POOL(d2, d3, d4), res.s3 = POOL(d3, d4, d5)
 *
 * Written as a do { } while(0) block so it behaves as a single statement at the call site.
 *
 * @param[out] res    4-element vector of DATA_TYPE receiving the pooled values
 * @param[in]  input  Tensor3D structure describing the source (its address is taken)
 * @param[in]  output Unused; kept so all POOLING3x3_STRIDEn variants share one signature
 */
#define POOLING3x3_STRIDE1(res, input, output) \
    do \
    { \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        data00 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        data01 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        data10 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        data11 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
        data00 = POW2_OP(data00, 4); \
        data01 = POW2_OP(data01, 2); \
        data10 = POW2_OP(data10, 4); \
        data11 = POW2_OP(data11, 2); \
        data20 = POW2_OP(data20, 4); \
        data21 = POW2_OP(data21, 2); \
        \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01212323); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01212323); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
        \
        values00 = POOL_OP(values00, values10); \
        values01 = POOL_OP(values01, values11); \
        values00 = POOL_OP(values00, values20); \
        values01 = POOL_OP(values01, values21); \
        \
        res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
        res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \
    } while(0)

115

116

/** Computes four horizontally adjacent 3x3 pooling results when STRIDE_X == 2.
 *
 * Nine consecutive elements (d0..d8) are loaded from each of three rows (y offsets 0, 1, 2):
 * eight as a vector plus one scalar at element offset 8. After POW2_OP each row is rearranged
 * so that the three rows can be combined lane-wise with POOL_OP and the columns reduced afterwards:
 *   res.s0 = POOL(d0, d1, d2), res.s1 = POOL(d2, d3, d4),
 *   res.s2 = POOL(d4, d5, d6), res.s3 = POOL(d6, d7, d8)
 *
 * Written as a do { } while(0) block so it behaves as a single statement at the call site.
 *
 * @param[out] res    4-element vector of DATA_TYPE receiving the pooled values
 * @param[in]  input  Tensor3D structure describing the source (its address is taken)
 * @param[in]  output Unused; kept so all POOLING3x3_STRIDEn variants share one signature
 */
#define POOLING3x3_STRIDE2(res, input, output) \
    do \
    { \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        data00           = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
        DATA_TYPE data01 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        data10           = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
        DATA_TYPE data11 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        data20           = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
        DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
        data00 = POW2_OP(data00, 8); \
        data01 = POW2_OP(data01, 1); \
        data10 = POW2_OP(data10, 8); \
        data11 = POW2_OP(data11, 1); \
        data20 = POW2_OP(data20, 8); \
        data21 = POW2_OP(data21, 1); \
        \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s667, data01); \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01223445); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data10.s667, data11); \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01223445); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data20.s667, data21); \
        \
        values00 = POOL_OP(values00, values10); \
        values01 = POOL_OP(values01, values11); \
        values00 = POOL_OP(values00, values20); \
        values01 = POOL_OP(values01, values21); \
        \
        res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
        res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \
    } while(0)

155

156

/** Computes four horizontally adjacent 3x3 pooling results when STRIDE_X == 3.
 *
 * Twelve consecutive elements (d0..d11) are loaded from each of three rows (y offsets 0, 1, 2):
 * eight as one vector plus four more at element offset 8. Because the windows do not overlap,
 * the rows are combined lane-wise with POOL_OP first and the columns are reduced afterwards:
 *   res.s0 = POOL(d0, d1, d2),  res.s1 = POOL(d3, d4, d5),
 *   res.s2 = POOL(d6, d7, d8),  res.s3 = POOL(d9, d10, d11)
 *
 * Written as a do { } while(0) block so it behaves as a single statement at the call site.
 *
 * @param[out] res    4-element vector of DATA_TYPE receiving the pooled values
 * @param[in]  input  Tensor3D structure describing the source (its address is taken)
 * @param[in]  output Unused; kept so all POOLING3x3_STRIDEn variants share one signature
 */
#define POOLING3x3_STRIDE3(res, input, output) \
    do \
    { \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        data01 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        data11 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
        VEC_DATA_TYPE(DATA_TYPE, 8) \
        data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
        data00 = POW2_OP(data00, 8); \
        data01 = POW2_OP(data01, 4); \
        data10 = POW2_OP(data10, 8); \
        data11 = POW2_OP(data11, 4); \
        data20 = POW2_OP(data20, 8); \
        data21 = POW2_OP(data21, 4); \
        \
        data00 = POOL_OP(data00, data10); \
        data01 = POOL_OP(data01, data11); \
        data00 = POOL_OP(data00, data20); \
        data01 = POOL_OP(data01, data21); \
        \
        res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s147, data01.s2)); \
        res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03)); \
    } while(0)

185

186

/** Returns the number of elements covered by the pooling window of the current work-item.
 *
 * The window origin is derived from the work-item's global ids (dimension 0 for x, dimension 1
 * for y), the strides and the paddings. The window end is clamped to the given upper bounds.
 * When EXCLUDE_PADDING is defined the window origin is additionally clamped to 0.
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] upper_bound_w Upper bound applied to the window end along x
 * @param[in] upper_bound_h Upper bound applied to the window end along y
 * @param[in] pad_x         Padding subtracted from the window origin along x
 * @param[in] pad_y         Padding subtracted from the window origin along y
 * @param[in] stride_x      Step of the window along x
 * @param[in] stride_y      Step of the window along y
 *
 * @return The window area (width * height), converted to DATA_TYPE
 */
DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
                              const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Window origin for this work-item (may be negative inside the padded border)
    int x_begin = get_global_id(0) * stride_x - pad_x;
    int y_begin = get_global_id(1) * stride_y - pad_y;

    // Window end, clamped to the upper bounds
    const int x_end = min(x_begin + pool_size, upper_bound_w);
    const int y_end = min(y_begin + pool_size, upper_bound_h);

#if defined(EXCLUDE_PADDING)
    // Clamp the origin to 0 (done after the end has been computed from the unclamped origin)
    x_begin = max(0, x_begin);
    y_begin = max(0, y_begin);
#endif /* defined(EXCLUDE_PADDING) */

    const int window_width  = x_end - x_begin;
    const int window_height = y_end - y_begin;
    return (window_height * window_width);
}

199

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

200

/** Performs a pooling function of pool size equal to 2.

201

*

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

202

* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

203

* @note In case of average pooling the following information must be passed at compile time:

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

204

* -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

205

* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)

206

* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions

207

* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

208

*

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

209

* @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

210

* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)

211

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

212

* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)

213

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

214

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

215

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

216

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

217

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

218

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

219

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

220

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

221

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

222

* @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)

223

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

224

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

225

*/

226

__kernel void pooling_layer_2(

227

TENSOR3D_DECLARATION(input),

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

228

TENSOR3D_DECLARATION(output))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

229

{

230

// Get pixels pointer

231

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

232

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

233

234

// Load data

235

VEC_DATA_TYPE(DATA_TYPE, 2)

236

data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));

237

VEC_DATA_TYPE(DATA_TYPE, 2)

238

data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));

239

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

240

#if defined(POOL_L2)

241

// Raise to power of 2 for L2 Pooling

242

data0 = POW2_OP(data0, 2);

243

data1 = POW2_OP(data1, 2);

244

#endif /* defined(POOL_L2) */

245

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

246

// Perform calculations

247

data0 = POOL_OP(data0, data1);

248

DATA_TYPE res = POOL_OP(data0.s0, data0.s1);

249

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

250

#if defined(POOL_AVG) || defined(POOL_L2)

251

// Divide by pool region in case of average or l2 pooling

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

252

res = DIV_OP(res, calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

253

#endif /* defined(POOL_AVG) || defined(POOL_L2) */

254

255

#if defined(POOL_L2)

256

// Take square root of the result in L2 pooling

257

res = SQRT_OP(res);

258

#endif /* defined(POOL_L2) */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

259

260

// Store result

261

*(__global DATA_TYPE *)output.ptr = res;

262

}

263

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

264

/** Performs a pooling function of pool size equal to 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

265

*

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

266

* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

267

* @note In case of average pooling the following information must be passed at compile time:

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

268

* -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

269

* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)

270

* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions

271

* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

272

*

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

273

* @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

274

* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)

275

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

276

* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)

277

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

278

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

279

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

280

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

281

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

282

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

283

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

284

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

285

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

286

* @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)

287

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

288

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

289

*/

290

__kernel void pooling_layer_3(

291

TENSOR3D_DECLARATION(input),

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

292

TENSOR3D_DECLARATION(output))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

293

{

294

// Get pixels pointer

295

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

296

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

297

298

// Load data

299

VEC_DATA_TYPE(DATA_TYPE, 3)

300

data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));

301

VEC_DATA_TYPE(DATA_TYPE, 3)

302

data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));

303

VEC_DATA_TYPE(DATA_TYPE, 3)

304

data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));

305

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

306

#if defined(POOL_L2)

307

// Raise to power of 2 for L2 Pooling

308

data0 = POW2_OP(data0, 3);

309

data1 = POW2_OP(data1, 3);

310

data2 = POW2_OP(data2, 3);

311

#endif /* defined(POOL_L2) */

312

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

313

// Perform calculations

314

data0 = POOL_OP(data0, data1);

315

data0 = POOL_OP(data0, data2);

316

DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);

317

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

318

#if defined(POOL_AVG) || defined(POOL_L2)

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

319

// Divide by pool region in case of average pooling

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

320

res = DIV_OP(res, calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

321

#endif /* defined(POOL_AVG) || defined(POOL_L2) */

322

323

#if defined(POOL_L2)

324

// Take square root of the result in L2 pooling

325

res = SQRT_OP(res);

326

#endif /* defined(POOL_L2) */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

327

328

// Store result

329

*(__global DATA_TYPE *)output.ptr = res;

330

}

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

331

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

332

#if defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)

333

334

#define CONVERT_OP(data_type) convert_##data_type##4

335

#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)

336

337

/** Returns, for four adjacent outputs along x, the reciprocal of the pooling window area.
 *
 * Each work-item handles four outputs along x, so lane i uses the x index
 * (get_global_id(0) * 4 + i); y comes from get_global_id(1). Window ends are clamped to the
 * given upper bounds. When EXCLUDE_PADDING is defined the window origins are additionally
 * clamped to 0.
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] upper_bound_w Upper bound applied to the window end along x
 * @param[in] upper_bound_h Upper bound applied to the window end along y
 * @param[in] pad_x         Padding subtracted from the window origin along x
 * @param[in] pad_y         Padding subtracted from the window origin along y
 * @param[in] stride_x      Step of the window along x
 * @param[in] stride_y      Step of the window along y
 *
 * @return 1 / (window width * window height) per lane, as a 4-element vector of DATA_TYPE
 */
VEC_DATA_TYPE(DATA_TYPE, 4)
calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
                     const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Window origins: one x origin per output lane, a single shared y origin
    int4 x_begin = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
    int  y_begin = get_global_id(1) * stride_y - pad_y;

    // Window ends, clamped to the upper bounds
    const int4 x_end = min(x_begin + (int4)pool_size, (int4)upper_bound_w);
    const int  y_end = min(y_begin + pool_size, upper_bound_h);

#if defined(EXCLUDE_PADDING)
    // Clamp the origins to 0 (done after the ends have been computed from the unclamped origins)
    x_begin = max((int4)0, x_begin);
    y_begin = max(0, y_begin);
#endif /* defined(EXCLUDE_PADDING) */

    const int4 window_area = ((int4)(y_end - y_begin)) * (x_end - x_begin);
    return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(window_area);
}

351

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

352

/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less than or equal to 3

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

353

*

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

354

* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

355

* @note In case of average pooling the following information must be passed at compile time:

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

356

* -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

357

* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)

358

* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions

359

* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

360

*

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

361

* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

362

* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)

363

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

364

* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)

365

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

366

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

367

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

368

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

369

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

370

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

371

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

372

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

373

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

374

* @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)

375

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

376

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

377

*/

378

__kernel void pooling_layer_3_optimized(

379

TENSOR3D_DECLARATION(input),

380

TENSOR3D_DECLARATION(output))

381

{

382

// Get pixels pointer

383

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

384

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

385

386

VEC_DATA_TYPE(DATA_TYPE, 4)

387

res;

388

389

// Perform pooling 3x3 for 4 output elements

390

POOLING3x3(res, input, output);

391

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

392

#if defined(POOL_AVG) || defined(POOL_L2)

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

393

// Divide by pool region in case of average pooling

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

394

res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

395

#endif /* defined(POOL_AVG) || defined(POOL_L2) */

396

397

#if defined(POOL_L2)

398

// Take square root of the result in L2 pooling

399

res = SQRT_OP(res);

400

#endif /* defined(POOL_L2) */

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

401

402

vstore4(res, 0, (__global DATA_TYPE *)output.ptr);

403

}

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

404

#endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

405

406

/** Performs a pooling function of pool size equal to 7.

407

*

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

408

* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

409

* @note In case of average pooling the following information must be passed at compile time:

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

410

* -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

411

* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)

412

* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions

413

* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension

414

*

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

415

* @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

416

* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)

417

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)

419

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

421

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image

423

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

424

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

425

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

426

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

427

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

428

* @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)

429

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

430

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

431

*/

432

__kernel void pooling_layer_7(

433

TENSOR3D_DECLARATION(input),

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame]

434

TENSOR3D_DECLARATION(output))

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

435

{

436

// Get pixels pointer

437

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

438

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

439

440

// Load data

441

VEC_DATA_TYPE(DATA_TYPE, 8)

442

data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));

443

VEC_DATA_TYPE(DATA_TYPE, 8)

444

data1 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));

445

VEC_DATA_TYPE(DATA_TYPE, 8)

446

data2 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));

447

VEC_DATA_TYPE(DATA_TYPE, 8)

448

data3 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));

449

VEC_DATA_TYPE(DATA_TYPE, 8)

450

data4 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));

451

VEC_DATA_TYPE(DATA_TYPE, 8)

452

data5 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));

453

VEC_DATA_TYPE(DATA_TYPE, 8)

454

data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));

455

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

456

#if defined(POOL_L2)

457

// Raise to power of 2 for L2 Pooling

458

data0 = POW2_OP(data0, 8);

459

data1 = POW2_OP(data1, 8);

460

data2 = POW2_OP(data2, 8);

461

data3 = POW2_OP(data3, 8);

462

data4 = POW2_OP(data4, 8);

463

data5 = POW2_OP(data5, 8);

464

data6 = POW2_OP(data6, 8);

465

#endif /* defined(POOL_L2) */

466

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

467

// Pool operation of all rows

468

data0 = POOL_OP(data0, data1);

469

data2 = POOL_OP(data2, data3);

470

data4 = POOL_OP(data4, data5);

471

data0 = POOL_OP(data0, data2);

472

data4 = POOL_OP(data4, data6);

473

data0 = POOL_OP(data0, data4);

474

475

// Set last element

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

476

#if defined(POOL_AVG) || defined(POOL_L2)

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

477

data0.s7 = 0;

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

478

#else /* defined(POOL_AVG) || defined(POOL_L2) */

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

479

data0.s7 = data0.s6;

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

480

#endif /* defined(POOL_AVG) || defined(POOL_L2) */

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

481

482

// Reduce result

483

VEC_DATA_TYPE(DATA_TYPE, 4)

484

reduce4 = POOL_OP(data0.s0123, data0.s4567);

485

VEC_DATA_TYPE(DATA_TYPE, 2)

486

reduce2 = POOL_OP(reduce4.s01, reduce4.s23);

487

DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);

488

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

489

#if defined(POOL_AVG) || defined(POOL_L2)

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

490

// Divide by pool region in case of average pooling

steniu01

2017-06-23 17:00:26 +0100

[diff] [blame]

491

res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

492

#endif /* defined(POOL_AVG) || defined(POOL_L2) */

493

494

#if defined(POOL_L2)

495

// Take square root of the result in L2 pooling

496

res = SQRT_OP(res);

497

#endif /* defined(POOL_L2) */

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

498

499

// Store result

500

*(__global DATA_TYPE *)output.ptr = res;

501

}

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

502

503

#if defined(POOL_SIZE)

504

505

// Select INITIAL_VALUE, the seed assigned to the pooling accumulators, according to the
// pooling type and the data type:
//   - POOL_AVG or POOL_L2 defined             -> 0
//   - otherwise, FIXED_POINT_POSITION defined -> <DATA_TYPE>_MIN
//   - otherwise, FP16 non-zero                -> -HALF_MAX
//   - otherwise                               -> -FLT_MAX

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

506

#if defined(POOL_AVG) || defined(POOL_L2)

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

507

// Average and L2 pooling start from zero
#define INITIAL_VALUE 0

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

508

#else /* defined(POOL_AVG) || defined(POOL_L2) */

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

509

#ifdef FIXED_POINT_POSITION

510

// Two-level expansion so that DATA_TYPE is macro-expanded before being pasted with _MIN
#define MIN_VAL_EXPAND(type) type##_MIN

511

#define MIN_VAL(type) MIN_VAL_EXPAND(type)

512

// Fixed-point case: start from <DATA_TYPE>_MIN
#define INITIAL_VALUE MIN_VAL(DATA_TYPE)

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

513

#else // FIXED_POINT_POSITION

514

// NOTE: tested with #if (not #ifdef), so FP16 must expand to a non-zero value;
// a bare -DFP16 build option defines it as 1
#if FP16

515

// Half-precision case: start from the negated largest finite half value
#define INITIAL_VALUE -HALF_MAX

516

#else // FP16

517

// Single-precision case: start from the negated largest finite float value
#define INITIAL_VALUE -FLT_MAX

518

#endif // FP16

519

#endif // FIXED_POINT_POSITION

#endif /* defined(POOL_AVG) || defined(POOL_L2) */

/** Performs a pooling function of pool size equal to N

524

*

Georgios Pinitas

13fc22c

2017-10-19 18:35:59 +0100

[diff] [blame]

525

* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

526

* @note -DFP16 must be passed at compile time if half float data type is used

527

* @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;

528

* @note In case of average pooling the following information must be passed at compile time:

529

* -DPOOL_AVG must be provided otherwise max pooling will be performed.

530

* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)

531

* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions

532

* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension

533

*

Georgios Pinitas

13fc22c

2017-10-19 18:35:59 +0100

[diff] [blame]

534

* @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

535

* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)

536

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

537

* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)

538

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

539

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

540

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

541

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image

542

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

543

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

544

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

545

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

546

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

547

* @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)

548

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

549

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

550

*/

551

__kernel void pooling_layer_N(

552

TENSOR3D_DECLARATION(input),

553

TENSOR3D_DECLARATION(output))

554

{

555

// Get pixels pointer

556

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

557

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

558

559

VEC_DATA_TYPE(DATA_TYPE, 8)

560

vdata = INITIAL_VALUE;

561

DATA_TYPE sdata = INITIAL_VALUE;

562

563

// Load data

564

for(int y = 0; y < POOL_SIZE; y++)

565

{

566

int x = 0;

567

for(; x <= ((int)POOL_SIZE - 8); x += 8)

568

{

569

VEC_DATA_TYPE(DATA_TYPE, 8)

570

data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

571

#if defined(POOL_L2)

572

// Raise to power of 2 for L2 Pooling

573

data0 *= data0;

574

#endif /* defined(POOL_L2) */

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

575

vdata = POOL_OP(vdata, data0);

}

// Leftover

for(; x < (int)POOL_SIZE; ++x)

580

{

581

DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

582

#if defined(POOL_L2)

583

// Raise to power of 2 for L2 Pooling

584

data0 *= data0;

585

#endif /* defined(POOL_L2) */

586

sdata = POOL_OP(sdata, data0);

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

}

}

// Reduce result

VEC_DATA_TYPE(DATA_TYPE, 4)

592

reduce4 = POOL_OP(vdata.s0123, vdata.s4567);

593

VEC_DATA_TYPE(DATA_TYPE, 2)

594

reduce2 = POOL_OP(reduce4.s01, reduce4.s23);

595

DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);

596

res = POOL_OP(res, sdata);

597

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

598

#if defined(POOL_AVG) || defined(POOL_L2)

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

599

// Divide by pool region in case of average pooling

Gian Marco Iodice

2017-09-05 13:51:21 +0100

[diff] [blame]

600

res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));

Georgios Pinitas

2017-08-31 14:21:36 +0100

[diff] [blame]

601

#endif /* defined(POOL_AVG) || defined(POOL_L2) */

602

603

#if defined(POOL_L2)

604

// Take square root of the result in L2 pooling

605

res = SQRT_OP(res);

606

#endif /* defined(POOL_L2) */

Gian Marco Iodice