Blame - src/core/CL/cl_kernels/arg_min_max.cl - ml/ComputeLibrary

Manuel Bottini

7b9998d

2019-10-21 17:59:07 +0100

[diff] [blame]

1

/*

Michalis Spyrou

7317e39

2020-01-17 11:27:49 +0000

[diff] [blame^]

2

Manuel Bottini

7b9998d

2019-10-21 17:59:07 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

Michalis Spyrou

2020-01-17 11:27:49 +0000

[diff] [blame^]

26

/* Element-wise comparison helpers used to build CONDITION_TO_USE.
 *
 * - FLOAT_DATA_TYPE defined: forward to the OpenCL relational built-ins.
 * - Integer data with WIDTH defined: plain comparison yielding 1 or 0.
 * - Integer data without WIDTH: 16-lane comparison turned into a
 *   DATA_TYPE_SELECT vector holding -1 where the comparison is true and 0 otherwise.
 */
#if defined(FLOAT_DATA_TYPE)
#define ISGREATER(x, y) isgreater(x, y)
#define ISLESS(x, y) isless(x, y)
#else // !FLOAT_DATA_TYPE
#if defined(WIDTH)
// Arguments and the whole expansion are parenthesised so the ternary cannot be
// re-associated by operators surrounding the macro invocation.
#define ISGREATER(x, y) (((x) > (y)) ? 1 : 0)
#define ISLESS(x, y) (((x) < (y)) ? 1 : 0)
#else // !defined(WIDTH)
#define ISGREATER(x, y) select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))(-1), (x) > (y))
#define ISLESS(x, y) select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))(-1), (x) < (y))
#endif // defined(WIDTH)
#endif // defined(FLOAT_DATA_TYPE)

38

Manuel Bottini

7b9998d

2019-10-21 17:59:07 +0100

[diff] [blame]

39

/* Pick the comparison that drives the reduction.
 *
 * ARG_MAX maps to ISGREATER and ARG_MIN to ISLESS. ARG_MAX takes precedence when
 * both flags are defined; defining neither stops the build.
 */
#if defined(ARG_MAX)
#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
#elif defined(ARG_MIN)
#define CONDITION_TO_USE(x, y) ISLESS(x, y)
#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#error "Unsupported reduction operation!"
#endif // defined(ARG_MAX)

46

Michalis Spyrou

7317e39

2020-01-17 11:27:49 +0000

[diff] [blame^]

47

#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)

Manuel Bottini

7b9998d

2019-10-21 17:59:07 +0100

[diff] [blame]

48

#if defined(WIDTH)

49

#if defined(ARG_MIN)

50

#if defined(PREV_OUTPUT)

51

/** Reduce a run of previously computed arg-min candidates to a single index.
 *
 * Every entry of @p prev_res is used as an element offset into @p input. A candidate
 * replaces the current best only when its input value is strictly smaller, so on equal
 * values the candidate met first is kept.
 *
 * @note WIDTH must be defined at compile time.
 *
 * @param[in] input    Pointer to the first value of the row the candidate indices refer to.
 * @param[in] prev_res Pointer to the first candidate index to examine.
 * @param[in] x_idx    Chunk index; it is multiplied by 16 to derive element positions.
 *
 * @return The candidate index whose input value is the smallest.
 */
inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
{
    // NOTE(review): the bound is an absolute position, (x_idx + 1) * 16, unless that exceeds
    // WIDTH, in which case it becomes the remaining count WIDTH - x_idx * 16. Whether the
    // absolute form is intended depends on how the caller offsets prev_res - confirm.
    int end_elem = (x_idx + 1) * 16;
    if(end_elem > WIDTH)
    {
        end_elem = WIDTH - x_idx * 16;
    }

    DATA_TYPE_OUTPUT res = prev_res[0];
    for(int x_v = 1; x_v < end_elem; ++x_v)
    {
        // Scalar select: take the new candidate when the comparison is non-zero.
        res = select(res, prev_res[x_v], input[prev_res[x_v]] < input[res]);
    }
    return res;
}

#else // !defined(PREV_OUTPUT)

72

/** Find the index of the minimum value inside one chunk of a row.
 *
 * When WIDTH < 16 the first WIDTH values starting at @p input are scanned linearly and
 * the returned index is relative to @p input. Otherwise 16 values are loaded at once and
 * reduced pairwise (8, 4, 2, 1 lanes); the returned index is the winning lane plus the
 * element position at which the load started.
 *
 * On equal values the lowest index is kept in both paths.
 *
 * @note WIDTH and DATA_TYPE_SELECT must be defined at compile time.
 *
 * @param[in] input Pointer to the first value of the chunk.
 * @param[in] x_idx Chunk index; it is multiplied by 16 to derive the chunk's first element.
 *
 * @return Index of the minimum value.
 */
inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
{
#if WIDTH < 16
    DATA_TYPE_OUTPUT res = 0;
    for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
    {
        // Strict comparison keeps the earliest index on ties.
        res = select(res, x_v, *(input + x_v) < *(input + res));
    }
    return res;
#else // WIDTH >= 16
    int x_elem = x_idx * 16;
    // If a full 16-wide load from this chunk would run past WIDTH, start the load
    // (16 - WIDTH % 16) elements earlier so that it ends exactly at WIDTH.
    const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
    x_elem -= x_goback;

    VEC_DATA_TYPE(DATA_TYPE, 16)
    in = vload16(0, input - x_goback);
    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
    res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

    // 16 -> 8 lanes: "<=" keeps the lower half (smaller lane index) on ties.
    VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
    idx_sel       = (in.s01234567 <= in.s89abcdef);
    in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));

    // 8 -> 4 lanes: from here on lanes no longer hold ordered indices, so ties are
    // resolved explicitly in favour of the smaller recorded index.
    idx_sel.s0123 = (in.s0123 < in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
    in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));

    // 4 -> 2 lanes.
    idx_sel.s01 = (in.s01 < in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
    in.s01      = select(in.s23, in.s01, idx_sel.s01);
    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));

    // 2 -> 1 lane.
    idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
    res.s0     = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));

    // Translate the winning lane into an element position.
    return res.s0 + x_elem;
#endif // WIDTH < 16
}

116

#endif // defined(PREV_OUTPUT)

117

#endif // defined(ARG_MIN)

118

#if defined(ARG_MAX)

119

#if defined(PREV_OUTPUT)

120

/** Reduce a run of previously computed arg-max candidates to a single index.
 *
 * Every entry of @p prev_res is used as an element offset into @p input. A candidate
 * replaces the current best only when its input value is strictly greater, so on equal
 * values the candidate met first is kept.
 *
 * @note WIDTH must be defined at compile time.
 *
 * @param[in] input    Pointer to the first value of the row the candidate indices refer to.
 * @param[in] prev_res Pointer to the first candidate index to examine.
 * @param[in] x_idx    Chunk index; it is multiplied by 16 to derive element positions.
 *
 * @return The candidate index whose input value is the greatest.
 */
inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
{
    // NOTE(review): the bound is an absolute position, (x_idx + 1) * 16, unless that exceeds
    // WIDTH, in which case it becomes the remaining count WIDTH - x_idx * 16. Whether the
    // absolute form is intended depends on how the caller offsets prev_res - confirm.
    int end_elem = (x_idx + 1) * 16;
    if(end_elem > WIDTH)
    {
        end_elem = WIDTH - x_idx * 16;
    }

    DATA_TYPE_OUTPUT res = prev_res[0];
    for(int x_v = 1; x_v < end_elem; ++x_v)
    {
        // Scalar select: take the new candidate when the comparison is non-zero.
        res = select(res, prev_res[x_v], input[prev_res[x_v]] > input[res]);
    }
    return res;
}

#else // !defined(PREV_OUTPUT)

141

/** Find the index of the maximum value inside one chunk of a row.
 *
 * When WIDTH < 16 the first WIDTH values starting at @p input are scanned linearly and
 * the returned index is relative to @p input. Otherwise 16 values are loaded at once and
 * reduced pairwise (8, 4, 2, 1 lanes); the returned index is the winning lane plus the
 * element position at which the load started.
 *
 * On equal values the lowest index is kept in both paths.
 *
 * @note WIDTH and DATA_TYPE_SELECT must be defined at compile time.
 *
 * @param[in] input Pointer to the first value of the chunk.
 * @param[in] x_idx Chunk index; it is multiplied by 16 to derive the chunk's first element.
 *
 * @return Index of the maximum value.
 */
inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
{
#if WIDTH < 16
    DATA_TYPE_OUTPUT res = 0;
    for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
    {
        // Strict comparison keeps the earliest index on ties.
        res = select(res, x_v, *(input + x_v) > *(input + res));
    }
    return res;
#else // WIDTH >= 16
    int x_elem = x_idx * 16;
    // If a full 16-wide load from this chunk would run past WIDTH, start the load
    // (16 - WIDTH % 16) elements earlier so that it ends exactly at WIDTH.
    const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
    x_elem -= x_goback;

    VEC_DATA_TYPE(DATA_TYPE, 16)
    in = vload16(0, input - x_goback);
    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
    res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

    // 16 -> 8 lanes: ">=" keeps the lower half (smaller lane index) on ties.
    VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
    idx_sel       = (in.s01234567 >= in.s89abcdef);
    in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));

    // 8 -> 4 lanes: from here on lanes no longer hold ordered indices, so ties are
    // resolved explicitly in favour of the smaller recorded index.
    idx_sel.s0123 = (in.s0123 > in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
    in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));

    // 4 -> 2 lanes.
    idx_sel.s01 = (in.s01 > in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
    in.s01      = select(in.s23, in.s01, idx_sel.s01);
    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));

    // 2 -> 1 lane.
    idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
    res.s0     = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));

    // Translate the winning lane into an element position.
    return res.s0 + x_elem;
#endif // WIDTH < 16
}

185

#endif // defined(PREV_OUTPUT)

186

#endif // defined(ARG_MAX)

187

188

/** This kernel performs parallel reduction given an operation on x-axis.
 *
 * Each work-item first computes the arg-min/arg-max index of its own chunk, stores it in
 * @p local_results, and the work-group then reduces those indices in local memory. The
 * work-item with local id 0 writes the group's winning index to @p partial_res.
 * When two candidates hold equal values the smaller index is kept.
 *
 * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed using -DPREV_OUTPUT
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
 * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
 * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the ArgMax
 * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the ArgMin
 *
 * @param[in]  src_ptr                                   Pointer to the source tensor. Supported data types: S32/F16/F32
 * @param[in]  src_stride_x                              Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes         The offset of the first element in the source tensor
 * @param[in]  prev_res_ptr                              (Optional) Pointer to previous results tensor. Supported data types: U32/S32
 * @param[in]  prev_res_stride_x                         (Optional) Stride of the previous results tensor in X dimension (in bytes)
 * @param[in]  prev_res_step_x                           (Optional) prev_res_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  prev_res_stride_y                         (Optional) Stride of the previous results tensor in Y dimension (in bytes)
 * @param[in]  prev_res_step_y                           (Optional) prev_res_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  prev_res_offset_first_element_in_bytes    (Optional) The offset of the first element in the previous results tensor
 * @param[out] partial_res_ptr                           Pointer to the tensor receiving the partial result indices. Supported data types: U32/S32
 * @param[in]  partial_res_stride_x                      Stride of the partial results tensor in X dimension (in bytes)
 * @param[in]  partial_res_step_x                        partial_res_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  partial_res_stride_y                      Stride of the partial results tensor in Y dimension (in bytes)
 * @param[in]  partial_res_step_y                        partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  partial_res_offset_first_element_in_bytes The offset of the first element in the partial results tensor
 * @param[in]  local_results                             Local buffer for storing the partial result; indexed by local id along X,
 *                                                       so it must hold at least get_local_size(0) elements
 */
__kernel void arg_min_max_x(
    IMAGE_DECLARATION(src),
#if defined(PREV_OUTPUT)
    IMAGE_DECLARATION(prev_res),
#endif // defined(PREV_OUTPUT)
    IMAGE_DECLARATION(partial_res),
    __local DATA_TYPE_OUTPUT *local_results)
{
#if defined(PREV_OUTPUT)
    Image src      = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
    Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
#else  // !defined(PREV_OUTPUT)
    Image src                      = CONVERT_TO_IMAGE_STRUCT(src);
#endif // defined(PREV_OUTPUT)
    Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);

    unsigned int lsize = get_local_size(0);
    unsigned int lid   = get_local_id(0);

    const uint x_idx = get_global_id(0);
    const uint y_idx = get_global_id(1);
    // Start of the current row; the indices held in local_results are offsets from here.
    // NOTE(review): this uses y_idx only and does not move with the loop variable y below -
    // confirm the kernel is always launched with get_local_size(1) == 1.
    const __global DATA_TYPE *src_in_row = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y);

    for(unsigned int y = 0; y < get_local_size(1); ++y)
    {
        // Stage 1: every work-item reduces its own chunk to a single index.
#if defined(ARG_MAX)
#if defined(PREV_OUTPUT)
        local_results[lid] = arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
#else  // !defined(PREV_OUTPUT)
        local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
#endif // defined(PREV_OUTPUT)
#else  // defined(ARG_MIN)
#if defined(PREV_OUTPUT)
        local_results[lid] = arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
#else  // !defined(PREV_OUTPUT)
        local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
#endif // defined(PREV_OUTPUT)
#endif // defined(ARG_MAX) || defined(ARG_MIN)

        // All per-work-item indices must be visible before they are combined.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Perform parallel reduction
        // NOTE(review): the stride is halved each step, so entries are skipped when lsize
        // is not a power of two - confirm the host guarantees this.
        for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
        {
            if(lid < i)
            {
                DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
                DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
#if defined(ARG_MAX)
                // Take the partner's index when its value is greater, or equal with a smaller index.
                local_results[lid] = select(
                                         local_results[lid],
                                         local_results[lid + i],
                                         ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1));
#else  // defined(ARG_MIN)
                // Take the partner's index when its value is smaller, or equal with a smaller index.
                local_results[lid] = select(
                                         local_results[lid],
                                         local_results[lid + i],
                                         ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
#endif // defined(ARG_MAX) || defined(ARG_MIN)
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        // One result per work-group.
        if(lid == 0)
        {
            ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
        }
    }
}

#endif // defined(WIDTH)

286

287

#if defined(HEIGHT)

288

/** This kernel performs reduction on y-axis.
 *
 * Each work-item loads 16 consecutive values per row and walks rows 0 .. HEIGHT - 1,
 * keeping for every lane the row index of the value selected by CONDITION_TO_USE.
 * The running index starts at 0 and is replaced only when CONDITION_TO_USE reports the
 * new value as greater (ARG_MAX) or smaller (ARG_MIN) than the current best.
 *
 * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
 * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
 * @note Either -DARG_MAX or -DARG_MIN must be passed at compile time to choose the operation
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor receiving the indices. Supported data types: U32/S32
 * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the output tensor
 */
__kernel void arg_min_max_y(
    IMAGE_DECLARATION(src),
    IMAGE_DECLARATION(output))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(src);
    Image output = CONVERT_TO_IMAGE_STRUCT(output);

    // Row 0 seeds the running best values.
    VEC_DATA_TYPE(DATA_TYPE, 16)
    res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));

    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
    indx = 0;
    // The loop index uses DATA_TYPE_OUTPUT, matching the element type of indx and the
    // loop indices of the z-axis and w-axis kernels.
    for(DATA_TYPE_OUTPUT y = 1; y < HEIGHT; ++y)
    {
        VEC_DATA_TYPE(DATA_TYPE, 16)
        in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));

        VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
        cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
        // Update the index first: the value comparison below must still see the old res.
        indx = select(indx, y, cond_conv);
        res  = select(res, in, CONDITION_TO_USE(in, res));
    }

    // Store result
    vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
}

334

#endif // defined(HEIGHT)

335

336

#if defined(DEPTH)

337

/** This kernel performs reduction on z-axis.
 *
 * Each work-item loads 16 consecutive values per plane and walks planes 0 .. DEPTH - 1,
 * keeping for every lane the plane index of the value selected by CONDITION_TO_USE.
 * The running index starts at 0 and is replaced only when CONDITION_TO_USE reports the
 * new value as greater (ARG_MAX) or smaller (ARG_MIN) than the current best.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
 * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 * @note Either -DARG_MAX or -DARG_MIN must be passed at compile time to choose the operation
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor receiving the indices. Supported data types: U32/S32
 * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the output tensor
 */
__kernel void arg_min_max_z(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Plane 0 seeds the running best values.
    VEC_DATA_TYPE(DATA_TYPE, 16)
    res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));

    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
    indx = 0;
    for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
    {
        VEC_DATA_TYPE(DATA_TYPE, 16)
        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE, 16));

        VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
        cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
        // Update the index first: the value comparison below must still see the old res.
        indx = select(indx, z, cond_conv);
        res  = select(res, in, CONDITION_TO_USE(in, res));
    }

    // Store result
    vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
}

386

#endif /* defined(DEPTH) */

387

388

#if defined(BATCH) && defined(DEPTH)

389

/** This kernel performs reduction on w-axis.
 *
 * Each work-item loads 16 consecutive values per batch and walks batches 0 .. BATCH - 1,
 * keeping for every lane the batch index of the value selected by CONDITION_TO_USE.
 * The running index starts at 0 and is replaced only when CONDITION_TO_USE reports the
 * new value as greater (ARG_MAX) or smaller (ARG_MIN) than the current best.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
 * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
 * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 * @note Either -DARG_MAX or -DARG_MIN must be passed at compile time to choose the operation
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor receiving the indices. Supported data types: U32/S32
 * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_stride_w                      Stride of the output tensor in W dimension (in bytes)
 * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the output tensor
 */
__kernel void arg_min_max_w(
    TENSOR4D_DECLARATION(input),
    TENSOR4D_DECLARATION(output))
{
    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);

    // Batch 0 seeds the running best values.
    VEC_DATA_TYPE(DATA_TYPE, 16)
    res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));

    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
    indx = 0;
    for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
    {
        VEC_DATA_TYPE(DATA_TYPE, 16)
        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE, 16));

        VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
        cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
        // Update the index first: the value comparison below must still see the old res.
        indx = select(indx, w, cond_conv);
        res  = select(res, in, CONDITION_TO_USE(in, res));
    }

    // Store result
    vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
}

443

#endif /* defined(BATCH) && defined(DEPTH) */

Michalis Spyrou

7317e39

2020-01-17 11:27:49 +0000

[diff] [blame^]

444

#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */