Blame - src/core/CL/cl_kernels/direct_convolution1x1.cl - ml/ComputeLibrary

2017-07-28 14:46:20 +0100

[diff] [blame]

1

/*

Michele Di Giorgio

d9eaf61

2020-07-08 11:12:57 +0100

[diff] [blame]

2

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

26

#undef CONVERT_SAT

Michalis Spyrou

def665a

2017-08-14 11:26:37 +0100

[diff] [blame]

27

28

#define ADD_OP(a, b) ((a) + (b))

29

#define MUL_OP(a, b) ((a) * (b))

30

#define CONVERT_SAT(a, b) ((a))

31

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

32

#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

33

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

34

#if defined(DATA_LAYOUT_NHWC)

35

36

#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))

37

38

/** This kernel performs a direct convolution to convolve the low three dimensions of a tensor with data layout NHWC

39

*

40

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

41

* @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32

42

* @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1

43

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

44

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

45

*

46

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

47

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

48

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

49

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

50

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

51

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

52

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

53

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

54

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

55

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

56

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

57

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

58

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

59

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

60

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

61

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

62

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

63

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

64

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

65

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

66

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

67

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

68

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

69

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

70

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

71

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

72

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

73

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

74

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

75

*/

76

__kernel void direct_convolution1x1_nhwc(

77

TENSOR3D_DECLARATION(src),

78

TENSOR3D_DECLARATION(dst),

79

TENSOR3D_DECLARATION(weights),

80

#ifdef HAS_BIAS

81

VECTOR_DECLARATION(biases),

82

#endif /* defined(HAS_BIAS) */

83

unsigned int weights_stride_w)

84

{

85

Image src = CONVERT_TO_IMAGE_STRUCT(src);

86

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

87

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

88

89

#ifdef HAS_BIAS

90

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

91

#endif /* defined(HAS_BIAS) */

92

93

VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)

94

values = 0;

95

const int id0 = get_global_id(0);

96

const int id1 = get_global_id(1);

97

const int id2 = get_global_id(2);

98

weights.ptr += id0 * weights_stride_w;

99

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + id2 * STRIDE_Y * (int)src_stride_z;

100

101

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

102

{

103

DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;

104

#if STRIDE_X == 1

105

VEC_DATA_TYPE(DATA_TYPE, 8)

106

col0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(

107

PTR_TO_VALUE(src_addr + 0 * src_stride_y, DATA_TYPE),

108

PTR_TO_VALUE(src_addr + 1 * src_stride_y, DATA_TYPE),

109

PTR_TO_VALUE(src_addr + 2 * src_stride_y, DATA_TYPE),

110

PTR_TO_VALUE(src_addr + 3 * src_stride_y, DATA_TYPE),

111

PTR_TO_VALUE(src_addr + 4 * src_stride_y, DATA_TYPE),

112

PTR_TO_VALUE(src_addr + 5 * src_stride_y, DATA_TYPE),

113

PTR_TO_VALUE(src_addr + 6 * src_stride_y, DATA_TYPE),

114

PTR_TO_VALUE(src_addr + 7 * src_stride_y, DATA_TYPE));

115

#elif STRIDE_X == 2 /* STRIDE_X == 1 */

116

VEC_DATA_TYPE(DATA_TYPE, 8)

117

col0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(

118

PTR_TO_VALUE(src_addr + 0 * src_stride_y, DATA_TYPE),

119

PTR_TO_VALUE(src_addr + 2 * src_stride_y, DATA_TYPE),

120

PTR_TO_VALUE(src_addr + 4 * src_stride_y, DATA_TYPE),

121

PTR_TO_VALUE(src_addr + 6 * src_stride_y, DATA_TYPE),

122

PTR_TO_VALUE(src_addr + 8 * src_stride_y, DATA_TYPE),

123

PTR_TO_VALUE(src_addr + 10 * src_stride_y, DATA_TYPE),

124

PTR_TO_VALUE(src_addr + 12 * src_stride_y, DATA_TYPE),

125

PTR_TO_VALUE(src_addr + 14 * src_stride_y, DATA_TYPE));

126

#else /* STRIDE_X not equals 1 or 2 */

127

#error "STRIDE_X larger than 2 is not supported"

128

#endif /* STRIDE_X == 2 */

129

values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, col0));

130

131

src_addr += src_stride_x;

132

weights.ptr += weights_stride_x;

}

#ifdef HAS_BIAS

values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0))));

137

#endif /* defined(HAS_BIAS) */

138

139

*((__global DATA_TYPE *)dst.ptr) = values.s0;

140

*((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values.s1;

141

*((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values.s2;

142

*((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values.s3;

143

*((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values.s4;

144

*((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values.s5;

145

*((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values.s6;

146

*((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values.s7;

147

}

148

#endif // defined(DATA_LAYOUT_NHWC)

149

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

150

#if STRIDE_X == 3

151

#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size

152

#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)

153

#elif STRIDE_X == 2

154

#define INPUT_PIXEL(data_size) extract_input_stride2

155

#elif STRIDE_X == 1

156

#define INPUT_PIXEL(data_size) extract_input_stride1

157

#else /* STRIDE_X not equals 1, 2 or 3 */

158

#error "Only support strides 1, 2 and 3"

159

#endif /* STRIDE_X == 3 */

160

161

/** Extracts a 1D horizontal vector from the input tensor with stride as 1.

162

*

163

* @param[in] input_pixel Pointer to the first pixel.

164

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

165

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

166

*/

167

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)

168

{

169

return vload8(0, input_pixel);

170

}

171

172

/** Extracts a 1D horizontal vector from the input tensor with stride as 2.

173

*

174

* @param[in] input_pixel Pointer to the first pixel.

175

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

176

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

177

*/

178

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)

179

{

180

VEC_DATA_TYPE(DATA_TYPE, 16)

181

temp = vload16(0, input_pixel);

182

return temp.s02468ace;

183

}

184

185

/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.

186

*

187

* @param[in] input_pixel Pointer to the first pixel.

188

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

189

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

190

*/

191

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)

192

{

193

VEC_DATA_TYPE(DATA_TYPE, 4)

194

temp1 = vload4(0, input_pixel);

195

VEC_DATA_TYPE(DATA_TYPE, 4)

196

temp2 = vload4(0, input_pixel + 6);

197

VEC_DATA_TYPE(DATA_TYPE, 4)

198

temp3 = vload4(0, input_pixel + 12);

199

VEC_DATA_TYPE(DATA_TYPE, 4)

200

temp4 = vload4(0, input_pixel + 18);

201

return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);

202

}

203

204

/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.

205

*

206

* @param[in] input_pixel Pointer to the first pixel.

207

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

208

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

209

*/

210

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)

211

{

212

VEC_DATA_TYPE(DATA_TYPE, 8)

213

temp1 = vload8(0, input_pixel);

214

VEC_DATA_TYPE(DATA_TYPE, 8)

215

temp2 = vload8(0, input_pixel + 8);

216

VEC_DATA_TYPE(DATA_TYPE, 8)

217

temp3 = vload8(0, input_pixel + 16);

218

return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);

219

}

220

221

/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.

222

*

223

* @param[in] input_pixel Pointer to the first pixel.

224

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

225

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

226

*/

227

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)

228

{

229

VEC_DATA_TYPE(DATA_TYPE, 16)

230

temp1 = vload16(0, input_pixel);

231

VEC_DATA_TYPE(DATA_TYPE, 16)

232

temp2 = vload16(0, input_pixel + 12);

233

return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);

234

}

235

236

/** This kernel performs a direct convolution to convolve the low three dimensions.

237

*

238

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

239

* @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

240

* @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1

241

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

242

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

243

*

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

244

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

245

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

246

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

247

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

248

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

249

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

250

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

251

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

252

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

253

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

254

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

255

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

256

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

257

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

258

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

259

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Joel Liang

f1f3ebd

2017-11-10 09:59:19 +0800

[diff] [blame]

260

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

261

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

262

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

263

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

264

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

265

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

266

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

267

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

268

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

269

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

270

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

271

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

272

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

273

*/

274

__kernel void direct_convolution1x1(

275

TENSOR3D_DECLARATION(src),

276

TENSOR3D_DECLARATION(dst),

277

TENSOR3D_DECLARATION(weights),

278

#ifdef HAS_BIAS

279

VECTOR_DECLARATION(biases),

280

#endif /* defined(HAS_BIAS) */

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

281

unsigned int weights_stride_w)

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

282

{

283

Image src = CONVERT_TO_IMAGE_STRUCT(src);

284

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

285

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

286

287

#ifdef HAS_BIAS

288

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

289

#endif /* defined(HAS_BIAS) */

290

Michalis Spyrou

def665a

2017-08-14 11:26:37 +0100

[diff] [blame]

291

VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

292

values = 0;

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

293

294

const uint z_index = get_global_id(2);

295

296

weights.ptr += z_index * weights_stride_w;

Gian Marco Iodice

744b5ed

2017-10-06 15:44:27 +0100

[diff] [blame]

297

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

298

{

299

DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;

300

VEC_DATA_TYPE(DATA_TYPE, 8)

301

input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

302

values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

303

src.ptr += src_stride_z;

304

weights.ptr += weights_stride_z;

305

}

306

307

#ifdef HAS_BIAS

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

308

values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

309

#endif /* defined(HAS_BIAS) */

310

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

311

vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

312

}

steniu01

db00668

2017-08-09 16:26:22 +0100

[diff] [blame]

313

#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

314

315

#if defined(WEIGHTS_DEPTH)

316

317

#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \

318

({ \

319

acc.s0 = mad(src.s0, weight_value, acc.s0); \

320

acc.s1 = mad(src.s1, weight_value, acc.s1); \

321

acc.s2 = mad(src.s2, weight_value, acc.s2); \

322

acc.s3 = mad(src.s3, weight_value, acc.s3); \

323

})

324

325

/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32

326

*

327

* @note This OpenCL kernel works only with stride_x and stride_y equal to 1

328

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

329

* @note In case biases, -DHAS_BIAS must to be passed at compile

330

*

331

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

332

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

333

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

334

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

335

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

336

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

337

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

338

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

339

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

340

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

341

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

342

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

343

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

344

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

345

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

346

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Joel Liang

f1f3ebd

2017-11-10 09:59:19 +0800

[diff] [blame]

347

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

348

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

349

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

350

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

351

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

352

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

353

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

354

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

355

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

356

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

357

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

358

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

359

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

360

*/

361

__kernel void direct_convolution1x1_f32_bifrost(

362

TENSOR3D_DECLARATION(src),

363

TENSOR3D_DECLARATION(dst),

364

TENSOR3D_DECLARATION(weights),

365

#ifdef HAS_BIAS

366

VECTOR_DECLARATION(biases),

367

#endif /* defined(HAS_BIAS) */

368

unsigned int weights_stride_w)

369

{

370

// Get the kernel index

371

const int kernel_index = get_global_id(2);

372

373

Image src = CONVERT_TO_IMAGE_STRUCT(src);

374

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

float4 acc0 = 0.0f;

float4 acc1 = 0.0f;

float4 acc2 = 0.0f;

float4 acc3 = 0.0f;

__global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);

382

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

383

384

for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)

385

{

386

// Load the weights

387

float weight = *((__global float *)weights_addr);

388

389

// Load values from row0 of input tensor

390

float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));

391

float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));

392

float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));

393

float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));

394

395

CONVOLUTION1x1_BIFROST(acc0, src0, weight);

396

CONVOLUTION1x1_BIFROST(acc1, src1, weight);

397

CONVOLUTION1x1_BIFROST(acc2, src2, weight);

398

CONVOLUTION1x1_BIFROST(acc3, src3, weight);

399

400

src_addr += src_stride_z;

401

weights_addr += weights_stride_z;

}

#ifdef HAS_BIAS

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

406

407

float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));

acc0.s0 += bias;

acc0.s1 += bias;

acc0.s2 += bias;

acc0.s3 += bias;

acc1.s0 += bias;

acc1.s1 += bias;

acc1.s2 += bias;

acc1.s3 += bias;

acc2.s0 += bias;

acc2.s1 += bias;

acc2.s2 += bias;

acc2.s3 += bias;

acc3.s0 += bias;

acc3.s1 += bias;

acc3.s2 += bias;

acc3.s3 += bias;

#endif /* defined(HAS_BIAS) */

426

427

vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));

428

vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));

429

vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));

430

vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));

431

}

Pablo Tello