Blame - src/core/CL/cl_kernels/pooling_layer.cl - ml/ComputeLibrary

2017-09-04 18:44:23 +0100

[diff] [blame]

/*

*

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

26

/* Element-wise combiner used by every pooling kernel in this file:
 * a sum when POOL_AVG is defined, otherwise the maximum of the two operands.
 */
#if defined(POOL_AVG)
#define POOL_OP(x, y) ((x) + (y))
#else /* !defined(POOL_AVG) */
#define POOL_OP(x, y) (fmax((x), (y)))
#endif /* defined(POOL_AVG) */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

31

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

32

/* Bind POOLING3x3 to the implementation matching the compile-time STRIDE_X.
 * For any stride other than 1, 2 or 3 the macro is left undefined.
 */
#if STRIDE_X == 1
#define POOLING3x3(dst, src, dst_tensor) POOLING3x3_STRIDE1(dst, src, dst_tensor)
#elif STRIDE_X == 2
#define POOLING3x3(dst, src, dst_tensor) POOLING3x3_STRIDE2(dst, src, dst_tensor)
#elif STRIDE_X == 3
#define POOLING3x3(dst, src, dst_tensor) POOLING3x3_STRIDE3(dst, src, dst_tensor)
#endif /* STRIDE_X dispatch */

39

40

/* Build the name of the 4-wide conversion built-in for a type, e.g. float -> convert_float4.
 * The extra indirection through CONVERT_VECTOR4 lets a macro argument such as DATA_TYPE
 * expand before it is pasted into the identifier.
 */
#define CONVERT_OP(type) convert_##type##4
#define CONVERT_VECTOR4(type) CONVERT_OP(type)

42

43

/* 3x3 pooling producing four horizontally adjacent results with a horizontal stride of 1.
 * Six consecutive elements are read from each of three rows of `input`; result i combines
 * elements i, i+1 and i+2 of every row. The four results are assigned to `res`.
 * `output` is not referenced by the macro body.
 */
#define POOLING3x3_STRIDE1(res, input, output) \
    ({ \
        /* Start address of each of the three rows covered by the window */ \
        __global DATA_TYPE *row0 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0); \
        __global DATA_TYPE *row1 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0); \
        __global DATA_TYPE *row2 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0); \
        \
        /* Elements 0..3 (lo) and 4..5 (hi) of each row */ \
        VEC_DATA_TYPE(DATA_TYPE, 4) r0_lo = vload4(0, row0); \
        VEC_DATA_TYPE(DATA_TYPE, 2) r0_hi = vload2(0, row0 + 4); \
        VEC_DATA_TYPE(DATA_TYPE, 4) r1_lo = vload4(0, row1); \
        VEC_DATA_TYPE(DATA_TYPE, 2) r1_hi = vload2(0, row1 + 4); \
        VEC_DATA_TYPE(DATA_TYPE, 4) r2_lo = vload4(0, row2); \
        VEC_DATA_TYPE(DATA_TYPE, 2) r2_hi = vload2(0, row2 + 4); \
        \
        /* Lay elements out as consecutive windows: a = {0,1,2, 1,2,3, 2,3}, b = {4, 3,4,5} */ \
        VEC_DATA_TYPE(DATA_TYPE, 8) a0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(r0_lo.s01212323); \
        VEC_DATA_TYPE(DATA_TYPE, 4) b0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(r0_hi.s0, r0_lo.s3, r0_hi.s01); \
        VEC_DATA_TYPE(DATA_TYPE, 8) a1 = (VEC_DATA_TYPE(DATA_TYPE, 8))(r1_lo.s01212323); \
        VEC_DATA_TYPE(DATA_TYPE, 4) b1 = (VEC_DATA_TYPE(DATA_TYPE, 4))(r1_hi.s0, r1_lo.s3, r1_hi.s01); \
        VEC_DATA_TYPE(DATA_TYPE, 8) a2 = (VEC_DATA_TYPE(DATA_TYPE, 8))(r2_lo.s01212323); \
        VEC_DATA_TYPE(DATA_TYPE, 4) b2 = (VEC_DATA_TYPE(DATA_TYPE, 4))(r2_hi.s0, r2_lo.s3, r2_hi.s01); \
        \
        /* Vertical reduction across the three rows */ \
        VEC_DATA_TYPE(DATA_TYPE, 8) a = POOL_OP(POOL_OP(a0, a1), a2); \
        VEC_DATA_TYPE(DATA_TYPE, 4) b = POOL_OP(POOL_OP(b0, b1), b2); \
        \
        /* Horizontal reduction: gather the 1st, 2nd and 3rd element of each window */ \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a.s036, b.s1); \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap1 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a.s147, b.s2); \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap2 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a.s25, b.s03); \
        res = POOL_OP(POOL_OP(tap0, tap1), tap2); \
    })

79

80

/* 3x3 pooling producing four horizontally adjacent results with a horizontal stride of 2.
 * Nine consecutive elements are read from each of three rows of `input`; result i combines
 * elements 2i, 2i+1 and 2i+2 of every row. The four results are assigned to `res`.
 * `output` is not referenced by the macro body.
 */
#define POOLING3x3_STRIDE2(res, input, output) \
    ({ \
        /* Start address of each of the three rows covered by the window */ \
        __global DATA_TYPE *row0 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0); \
        __global DATA_TYPE *row1 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0); \
        __global DATA_TYPE *row2 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0); \
        \
        /* Elements 0..7 as a vector plus element 8 as a scalar, per row */ \
        VEC_DATA_TYPE(DATA_TYPE, 8) r0_vec = vload8(0, row0); \
        DATA_TYPE r0_last = *(row0 + 8); \
        VEC_DATA_TYPE(DATA_TYPE, 8) r1_vec = vload8(0, row1); \
        DATA_TYPE r1_last = *(row1 + 8); \
        VEC_DATA_TYPE(DATA_TYPE, 8) r2_vec = vload8(0, row2); \
        DATA_TYPE r2_last = *(row2 + 8); \
        \
        /* Lay elements out as consecutive windows: a = {0,1,2, 2,3,4, 4,5}, b = {6, 6,7,8} */ \
        VEC_DATA_TYPE(DATA_TYPE, 8) a0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(r0_vec.s01223445); \
        VEC_DATA_TYPE(DATA_TYPE, 4) b0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(r0_vec.s667, r0_last); \
        VEC_DATA_TYPE(DATA_TYPE, 8) a1 = (VEC_DATA_TYPE(DATA_TYPE, 8))(r1_vec.s01223445); \
        VEC_DATA_TYPE(DATA_TYPE, 4) b1 = (VEC_DATA_TYPE(DATA_TYPE, 4))(r1_vec.s667, r1_last); \
        VEC_DATA_TYPE(DATA_TYPE, 8) a2 = (VEC_DATA_TYPE(DATA_TYPE, 8))(r2_vec.s01223445); \
        VEC_DATA_TYPE(DATA_TYPE, 4) b2 = (VEC_DATA_TYPE(DATA_TYPE, 4))(r2_vec.s667, r2_last); \
        \
        /* Vertical reduction across the three rows */ \
        VEC_DATA_TYPE(DATA_TYPE, 8) a = POOL_OP(POOL_OP(a0, a1), a2); \
        VEC_DATA_TYPE(DATA_TYPE, 4) b = POOL_OP(POOL_OP(b0, b1), b2); \
        \
        /* Horizontal reduction: gather the 1st, 2nd and 3rd element of each window */ \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a.s036, b.s1); \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap1 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a.s147, b.s2); \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap2 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a.s25, b.s03); \
        res = POOL_OP(POOL_OP(tap0, tap1), tap2); \
    })

113

114

/* 3x3 pooling producing four horizontally adjacent results with a horizontal stride of 3.
 * Twelve consecutive elements are read from each of three rows of `input`; the windows do
 * not overlap, so result i combines elements 3i, 3i+1 and 3i+2 of every row.
 * The four results are assigned to `res`. `output` is not referenced by the macro body.
 */
#define POOLING3x3_STRIDE3(res, input, output) \
    ({ \
        /* Start address of each of the three rows covered by the window */ \
        __global DATA_TYPE *row0 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0); \
        __global DATA_TYPE *row1 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0); \
        __global DATA_TYPE *row2 = (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0); \
        \
        /* Elements 0..7 (lo) and 8..11 (hi) of each row */ \
        VEC_DATA_TYPE(DATA_TYPE, 8) r0_lo = vload8(0, row0); \
        VEC_DATA_TYPE(DATA_TYPE, 4) r0_hi = vload4(0, row0 + 8); \
        VEC_DATA_TYPE(DATA_TYPE, 8) r1_lo = vload8(0, row1); \
        VEC_DATA_TYPE(DATA_TYPE, 4) r1_hi = vload4(0, row1 + 8); \
        VEC_DATA_TYPE(DATA_TYPE, 8) r2_lo = vload8(0, row2); \
        VEC_DATA_TYPE(DATA_TYPE, 4) r2_hi = vload4(0, row2 + 8); \
        \
        /* Vertical reduction across the three rows */ \
        VEC_DATA_TYPE(DATA_TYPE, 8) lo = POOL_OP(POOL_OP(r0_lo, r1_lo), r2_lo); \
        VEC_DATA_TYPE(DATA_TYPE, 4) hi = POOL_OP(POOL_OP(r0_hi, r1_hi), r2_hi); \
        \
        /* Horizontal reduction: gather the 1st, 2nd and 3rd element of each window */ \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(lo.s036, hi.s1); \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap1 = (VEC_DATA_TYPE(DATA_TYPE, 4))(lo.s147, hi.s2); \
        VEC_DATA_TYPE(DATA_TYPE, 4) tap2 = (VEC_DATA_TYPE(DATA_TYPE, 4))(lo.s25, hi.s03); \
        res = POOL_OP(POOL_OP(tap0, tap1), tap2); \
    })

137

138

/** Compute the reciprocal of the pooling-window area for the current work-item.
 *
 * The window starts at (global_id * stride - pad) in each dimension and extends
 * @p pool_size elements, clamped to the given upper bounds.
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] upper_bound_w Exclusive upper limit for the window end along x
 * @param[in] upper_bound_h Exclusive upper limit for the window end along y
 * @param[in] pad_x         Padding subtracted from the window start along x
 * @param[in] pad_y         Padding subtracted from the window start along y
 * @param[in] stride_x      Window step along x
 * @param[in] stride_y      Window step along y
 *
 * @return 1 / (window height * window width)
 */
DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
                              const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Window origin for this work-item
    const int x0 = get_global_id(0) * stride_x - pad_x;
    const int y0 = get_global_id(1) * stride_y - pad_y;

    // Window end, clamped to the upper bounds
    const int x1 = min(x0 + pool_size, upper_bound_w);
    const int y1 = min(y0 + pool_size, upper_bound_h);

    const int window_area = (y1 - y0) * (x1 - x0);
    return 1.f / window_area;
}

147

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

148

/** Four-wide variant of the average-pooling scale.
 *
 * Lane i uses the x position (get_global_id(0) * 4 + i); all four lanes share the
 * same y position. Each window is clamped to the given upper bounds.
 *
 * @param[in] pool_size     Side length of the square pooling window
 * @param[in] upper_bound_w Exclusive upper limit for the window end along x
 * @param[in] upper_bound_h Exclusive upper limit for the window end along y
 * @param[in] pad_x         Padding subtracted from the window start along x
 * @param[in] pad_y         Padding subtracted from the window start along y
 * @param[in] stride_x      Window step along x
 * @param[in] stride_y      Window step along y
 *
 * @return Per-lane 1 / (window height * window width)
 */
VEC_DATA_TYPE(DATA_TYPE, 4)
calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
                     const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // x positions of the four windows covered by this work-item
    const int4 lane_offsets = (int4)(0, 1, 2, 3);
    const int4 x0           = ((int4)get_global_id(0) * 4 + lane_offsets) * (int4)stride_x - (int4)pad_x;
    const int  y0           = get_global_id(1) * stride_y - pad_y;

    // Window ends, clamped to the upper bounds
    const int4 x1 = min(x0 + (int4)pool_size, (int4)upper_bound_w);
    const int  y1 = min(y0 + pool_size, upper_bound_h);

    const int4 window_area = ((int4)(y1 - y0)) * (x1 - x0);
    return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(window_area);
}

158

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

159

/** Performs a pooling function of pool size equal to 2.

160

*

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

161

* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;

162

* @note In case of average pooling the following information must be passed at compile time:

163

* -DPOOL_AVG must be provided otherwise max pooling will be performed.

164

* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)

165

* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions

166

* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

167

*

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

168

* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

169

* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)

170

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

171

* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)

172

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

173

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

174

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

175

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

176

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

177

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

178

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

179

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

180

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

181

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

182

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

183

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

184

*/

185

__kernel void pooling_layer_2(

186

TENSOR3D_DECLARATION(input),

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

187

TENSOR3D_DECLARATION(output))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

188

{

189

// Get pixels pointer

190

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

191

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

192

193

// Load data

194

VEC_DATA_TYPE(DATA_TYPE, 2)

195

data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));

196

VEC_DATA_TYPE(DATA_TYPE, 2)

197

data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));

198

199

// Perform calculations

200

data0 = POOL_OP(data0, data1);

201

DATA_TYPE res = POOL_OP(data0.s0, data0.s1);

202

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

203

// Divide by pool region in case of average pooling

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

204

#ifdef POOL_AVG

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

205

res *= calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

206

#endif /* POOL_AVG */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

207

208

// Store result

209

*(__global DATA_TYPE *)output.ptr = res;

210

}

211

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

212

/** Performs a pooling function of pool size equal to 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

213

*

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

214

* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;

215

* @note In case of average pooling the following information must be passed at compile time:

216

* -DPOOL_AVG must be provided otherwise max pooling will be performed.

217

* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)

218

* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions

219

* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

220

*

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

221

* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

222

* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)

223

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

224

* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)

225

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

226

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

227

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

228

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

229

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

230

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

231

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

232

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

233

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

234

* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)

235

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

236

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

237

*/

238

__kernel void pooling_layer_3(

239

TENSOR3D_DECLARATION(input),

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

240

TENSOR3D_DECLARATION(output))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

241

{

242

// Get pixels pointer

243

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

244

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

245

246

// Load data

247

VEC_DATA_TYPE(DATA_TYPE, 3)

248

data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));

249

VEC_DATA_TYPE(DATA_TYPE, 3)

250

data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));

251

VEC_DATA_TYPE(DATA_TYPE, 3)

252

data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));

253

254

// Perform calculations

255

data0 = POOL_OP(data0, data1);

256

data0 = POOL_OP(data0, data2);

257

DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);

258

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

259

// Divide by pool region in case of average pooling

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

260

#ifdef POOL_AVG

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

261

res *= calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);

262

#endif //POOL_AVG

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

263

264

// Store result

265

*(__global DATA_TYPE *)output.ptr = res;

266

}

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

267

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

268

#ifdef POOLING3x3
/** Optimized pooling kernel for a 3x3 window; each work-item produces four adjacent output elements.
 *
 * Only compiled when POOLING3x3 is defined, i.e. when STRIDE_X is 1, 2 or 3.
 *
 * @note The data type must be supplied with -DDATA_TYPE, e.g. -DDATA_TYPE=float. Supported data types: F16/F32.
 * @note For average pooling the following must be passed at compile time:
 *       -DPOOL_AVG, otherwise max pooling is performed.
 *       -DMAX_WIDTH and -DMAX_HEIGHT, the maximum accessible indices in the x and y dimensions (width + pad).
 *       -DSTRIDE_X and -DSTRIDE_Y, the steps of the window along the x and y directions.
 *       -DPAD_X and -DPAD_Y, the pooling paddings in the x and y dimensions.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void pooling_layer_3_optimized(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output))
{
    // Wrap the raw kernel arguments into tensor descriptors
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Four pooled values, filled in by the stride-specific macro
    VEC_DATA_TYPE(DATA_TYPE, 4)
    res;
    POOLING3x3(res, input, output);

#ifdef POOL_AVG
    // Average pooling: scale each accumulated sum by its own window area
    res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#endif /* POOL_AVG */

    // Write the four pooled values
    vstore4(res, 0, (__global DATA_TYPE *)output.ptr);
}
#endif /* POOLING3x3 */

317

318

/** Performs a pooling function of pool size equal to 7.

319

*

320

* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;

321

* @note In case of average pooling the following information must be passed at compile time:

322

* -DPOOL_AVG must be provided otherwise max pooling will be performed.

323

* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)

324

* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions

325

* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension

326

*

327

* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32

328

* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)

329

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

330

* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)

331

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

332

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

333

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

334

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image

335

* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr

336

* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)

337

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

338

* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)

339

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

340

* @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)

341

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

342

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

343

*/

344

__kernel void pooling_layer_7(

345

TENSOR3D_DECLARATION(input),

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

346

TENSOR3D_DECLARATION(output))

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

347

{

348

// Get pixels pointer

349

Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);

350

Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

351

352

// Load data

353

VEC_DATA_TYPE(DATA_TYPE, 8)

354

data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));

355

VEC_DATA_TYPE(DATA_TYPE, 8)

356

data1 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));

357

VEC_DATA_TYPE(DATA_TYPE, 8)

358

data2 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));

359

VEC_DATA_TYPE(DATA_TYPE, 8)

360

data3 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));

361

VEC_DATA_TYPE(DATA_TYPE, 8)

362

data4 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));

363

VEC_DATA_TYPE(DATA_TYPE, 8)

364

data5 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));

365

VEC_DATA_TYPE(DATA_TYPE, 8)

366

data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));

367

368

// Pool operation of all rows

369

data0 = POOL_OP(data0, data1);

370

data2 = POOL_OP(data2, data3);

371

data4 = POOL_OP(data4, data5);

372

data0 = POOL_OP(data0, data2);

373

data4 = POOL_OP(data4, data6);

374

data0 = POOL_OP(data0, data4);

// Set last element

#ifdef POOL_AVG

data0.s7 = 0;

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

379

#else /* POOL_AVG */

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

380

data0.s7 = data0.s6;

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

381

#endif /* POOL_AVG */

Georgios Pinitas

2017-06-19 16:11:53 +0100

[diff] [blame]

382

383

// Reduce result

384

VEC_DATA_TYPE(DATA_TYPE, 4)

385

reduce4 = POOL_OP(data0.s0123, data0.s4567);

386

VEC_DATA_TYPE(DATA_TYPE, 2)

387

reduce2 = POOL_OP(reduce4.s01, reduce4.s23);

388

DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);

389

390

// Divide by pool region in case of average pooling

391

#ifdef POOL_AVG

Gian Marco Iodice

2017-08-02 13:19:48 +0100

[diff] [blame^]

392

res *= calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

393

#endif /* POOL_AVG */

Georgios Pinitas