Blame - src/core/CL/cl_kernels/reduction_operation.cl - ml/ComputeLibrary

2017-08-08 17:42:38 +0100

[diff] [blame]

1

/*

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

2

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

/** Calculate square sum of a vector

27

*

28

* @param[in] input Pointer to the first pixel.

29

*

30

* @return square sum of vector.

31

*/

32

inline DATA_TYPE square_sum(__global const DATA_TYPE *input)

33

{

34

VEC_DATA_TYPE(DATA_TYPE, 16)

35

in = vload16(0, input);

in *= in;

in.s01234567 += in.s89ABCDEF;

40

in.s0123 += in.s4567;

41

in.s01 += in.s23;

42

43

return (in.s0 + in.s1);

44

}

45

46

/** Calculate sum of a vector

47

*

48

* @param[in] input Pointer to the first pixel.

49

*

50

* @return sum of vector.

51

*/

52

inline DATA_TYPE sum(__global const DATA_TYPE *input)

53

{

54

VEC_DATA_TYPE(DATA_TYPE, 16)

55

in = vload16(0, input);

56

57

in.s01234567 += in.s89ABCDEF;

58

in.s0123 += in.s4567;

59

in.s01 += in.s23;

60

61

return (in.s0 + in.s1);

62

}

63

Michalis Spyrou

2018-10-05 14:49:28 +0100

[diff] [blame]

64

/** This kernel performs parallel reduction given an operation on x-axis.

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

65

*

66

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

67

* @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum

Michalis Spyrou

2018-10-05 14:49:28 +0100

[diff] [blame]

68

* @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value

69

* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

70

*

Michalis Spyrou

2018-10-05 14:49:28 +0100

[diff] [blame]

71

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

72

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

73

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

74

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

75

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

76

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

77

* @param[in] partial_sum_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

78

* @param[in] partial_sum_stride_x Stride of the output tensor in X dimension (in bytes)

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

79

* @param[in] partial_sum_step_x partial_sum_stride_x * number of elements along X processed per workitem(in bytes)

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

80

* @param[in] partial_sum_stride_y Stride of the output tensor in Y dimension (in bytes)

81

* @param[in] partial_sum_step_y partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

82

* @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

83

* @param[in] local_sums Local buffer for storing the partial sum

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

84

*/

Michalis Spyrou

2018-10-05 14:49:28 +0100

[diff] [blame]

85

__kernel void reduction_operation_x(

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

86

IMAGE_DECLARATION(src),

87

IMAGE_DECLARATION(partial_sum),

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

88

__local DATA_TYPE *local_sums)

89

{

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

90

Image src = CONVERT_TO_IMAGE_STRUCT(src);

91

Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

92

93

unsigned int lsize = get_local_size(0);

94

unsigned int lid = get_local_id(0);

95

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

96

for(unsigned int y = 0; y < get_local_size(1); ++y)

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

97

{

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

98

local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

99

barrier(CLK_LOCAL_MEM_FENCE);

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

100

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

101

// Perform parallel reduction

102

for(unsigned int i = lsize >> 1; i > 0; i >>= 1)

{

if(lid < i)

{

local_sums[lid] += local_sums[lid + i];

107

}

108

barrier(CLK_LOCAL_MEM_FENCE);

}

if(lid == 0)

{

Michalis Spyrou

2018-10-05 14:49:28 +0100

[diff] [blame]

113

#if defined(MEAN) && defined(WIDTH)

114

if(y == get_local_size(1) - 1)

115

{

116

local_sums[0] /= WIDTH;

117

}

118

#endif /* defined(MEAN) && defined(WIDTH) */

Michalis Spyrou

2018-01-26 15:06:19 +0000

[diff] [blame]

119

((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];

120

}

Michalis Spyrou

2017-08-08 17:42:38 +0100

[diff] [blame]

121

}

Michalis Spyrou

2018-10-05 14:49:28 +0100

[diff] [blame]

}

#if defined(WIDTH)
/** This kernel performs reduction on x-axis. (QASYMM8)
 *
 * A single work-item accumulates WIDTH consecutive uchar elements into a 32-bit
 * unsigned sum and stores the result as one uchar.
 *
 * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
 * @note The mean flag must be passed at compile time using -DMEAN if the sum has to be divided by WIDTH
 * @note The stored value saturates at 255: without saturation a sum that does not fit in a uchar
 *       would silently wrap modulo 256.
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor holding the reduced value. Supported data types: same as @p src_ptr
 * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reduction_operation_quantized_x(
    VECTOR_DECLARATION(src),
    VECTOR_DECLARATION(output))
{
    Vector src    = CONVERT_TO_VECTOR_STRUCT(src);
    Vector output = CONVERT_TO_VECTOR_STRUCT(output);

    // Accumulate in 32 bits so the running sum itself does not overflow for realistic widths
    uint res = 0;

    for(unsigned int x = 0; x < WIDTH; ++x)
    {
        res += *((__global uchar *)vector_offset(&src, x));
    }

#if defined(MEAN)
    res /= WIDTH;
#endif /* defined(MEAN) */

    // Store result, clamping to the uchar range instead of discarding the upper bits
    *((__global uchar *)output.ptr) = convert_uchar_sat(res);
}
#endif /* defined(WIDTH) */

160

161

#if defined(HEIGHT)
/** Reduction along the y-axis.
 *
 * Each work-item accumulates HEIGHT rows of 16 elements, lane by lane, in the promoted
 * data type and stores the 16 reduced values converted back to DATA_TYPE.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The accumulator type DATA_TYPE_PROMOTED must also be defined when this kernel is compiled
 * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
 * @note Define SUM_SQUARE to square every element before accumulation
 * @note Define MEAN to divide the accumulated values by HEIGHT
 * @note NOTE(review): the final CONVERT presumably does not saturate for integer types - confirm against helpers.h
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor holding the reduced values. Supported data types: same as @p src_ptr
 * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reduction_operation_y(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(output))
{
    Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst_img = CONVERT_TO_IMAGE_STRUCT(output);

    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
    acc = 0;

    for(unsigned int row = 0; row < HEIGHT; ++row)
    {
        __global DATA_TYPE *row_ptr = (__global DATA_TYPE *)offset(&src_img, 0, row);

        // Widen before any arithmetic so squaring and summing happen in the promoted type
        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
        lanes = CONVERT(vload16(0, row_ptr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
#if defined(SUM_SQUARE)
        lanes *= lanes;
#endif /* defined(SUM_SQUARE) */
        acc += lanes;
    }

#if defined(MEAN)
    acc /= HEIGHT;
#endif /* defined(MEAN) */

    // Narrow back to the tensor data type and write the 16 reduced lanes
    vstore16(CONVERT(acc, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)dst_img.ptr);
}
#endif /* defined(HEIGHT) */

208

209

#if defined(DEPTH)
/** Reduction along the z-axis.
 *
 * Each work-item accumulates DEPTH planes of 16 elements, lane by lane, in the promoted
 * data type and stores the 16 reduced values converted back to DATA_TYPE.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The accumulator type DATA_TYPE_PROMOTED must also be defined when this kernel is compiled
 * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 * @note Define SUM_SQUARE to square every element before accumulation
 * @note Define MEAN to divide the accumulated values by DEPTH
 * @note NOTE(review): the final CONVERT presumably does not saturate for integer types - confirm against helpers.h
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor holding the reduced values. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reduction_operation_z(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output))
{
    Tensor3D src_tensor = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D dst_tensor = CONVERT_TO_TENSOR3D_STRUCT(output);

    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
    acc = 0;

    for(unsigned int plane = 0; plane < DEPTH; ++plane)
    {
        __global DATA_TYPE *plane_ptr = (__global DATA_TYPE *)tensor3D_offset(&src_tensor, 0, 0, plane);

        // Widen before any arithmetic so squaring and summing happen in the promoted type
        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
        lanes = CONVERT(vload16(0, plane_ptr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
#if defined(SUM_SQUARE)
        lanes *= lanes;
#endif /* defined(SUM_SQUARE) */
        acc += lanes;
    }

#if defined(MEAN)
    acc /= DEPTH;
#endif /* defined(MEAN) */

    // Narrow back to the tensor data type and write the 16 reduced lanes
    vstore16(CONVERT(acc, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)dst_tensor.ptr);
}
#endif /* defined(DEPTH) */

260

261

#if defined(BATCH) && defined(DEPTH)

262

/** This kernel performs reduction on w-axis.

263

*

264

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

265

* @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128

266

* @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128

267

*

268

* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32

269

* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)

270

* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)

271

* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)

272

* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)

273

* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)

274

* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)

275

* @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)

276

* @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)

277

* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor

278

* @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr

279

* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)

280

* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)

281

* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)

282

* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)

283

* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)

284

* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)

285

* @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)

286

* @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)

287

* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor

288

*/

289

__kernel void reduction_operation_w(

290

TENSOR4D_DECLARATION(input),

291

TENSOR4D_DECLARATION(output))

292

{

293

Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);

294

Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);

295

296

VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)

297

res = 0;

298

299

for(unsigned int w = 0; w < BATCH; ++w)

300

{

Michalis Spyrou

8aaf93e

2018-10-11 17:33:32 +0100

[diff] [blame]

301

VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)

302

in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));

303

#if defined(SUM_SQUARE)

304

in *= in;

305

#endif // SQRSUM

306

res += in;

Michalis Spyrou