Blame - src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl - ml/ComputeLibrary

2017-07-28 14:46:20 +0100

[diff] [blame]

1

/*

Gian Marco Iodice

ff1fe3e

2021-01-02 09:58:51 +0000

[diff] [blame]

2

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

3

*

4

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

* SOFTWARE.

*/

#include "helpers.h"

steniu01

2017-08-09 16:26:22 +0100

[diff] [blame]

26

#undef CONVERT_SAT

Michalis Spyrou

def665a

2017-08-14 11:26:37 +0100

[diff] [blame]

27

28

#define ADD_OP(a, b) ((a) + (b))

29

#define MUL_OP(a, b) ((a) * (b))

30

#define CONVERT_SAT(a, b) ((a))

31

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

32

#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

33

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

34

#if STRIDE_X == 3

35

#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size

36

#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)

37

#elif STRIDE_X == 2

38

#define INPUT_PIXEL(data_size) extract_input_stride2

39

#elif STRIDE_X == 1

40

#define INPUT_PIXEL(data_size) extract_input_stride1

41

#else /* STRIDE_X not equals 1, 2 or 3 */

42

#error "Only support strides 1, 2 and 3"

43

#endif /* STRIDE_X == 3 */

44

45

/** Extracts a 1D horizontal vector from the input tensor with stride as 1.

46

*

47

* @param[in] input_pixel Pointer to the first pixel.

48

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

49

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

50

*/

51

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)

52

{

53

return vload8(0, input_pixel);

54

}

55

56

/** Extracts a 1D horizontal vector from the input tensor with stride as 2.

57

*

58

* @param[in] input_pixel Pointer to the first pixel.

59

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

60

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

61

*/

62

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)

63

{

64

VEC_DATA_TYPE(DATA_TYPE, 16)

65

temp = vload16(0, input_pixel);

66

return temp.s02468ace;

67

}

68

69

/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.

70

*

71

* @param[in] input_pixel Pointer to the first pixel.

72

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

73

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

74

*/

75

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)

76

{

77

VEC_DATA_TYPE(DATA_TYPE, 4)

78

temp1 = vload4(0, input_pixel);

79

VEC_DATA_TYPE(DATA_TYPE, 4)

80

temp2 = vload4(0, input_pixel + 6);

81

VEC_DATA_TYPE(DATA_TYPE, 4)

82

temp3 = vload4(0, input_pixel + 12);

83

VEC_DATA_TYPE(DATA_TYPE, 4)

84

temp4 = vload4(0, input_pixel + 18);

85

return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);

86

}

87

88

/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.

89

*

90

* @param[in] input_pixel Pointer to the first pixel.

91

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

92

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

93

*/

94

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)

95

{

96

VEC_DATA_TYPE(DATA_TYPE, 8)

97

temp1 = vload8(0, input_pixel);

98

VEC_DATA_TYPE(DATA_TYPE, 8)

99

temp2 = vload8(0, input_pixel + 8);

100

VEC_DATA_TYPE(DATA_TYPE, 8)

101

temp3 = vload8(0, input_pixel + 16);

102

return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);

103

}

104

105

/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.

106

*

107

* @param[in] input_pixel Pointer to the first pixel.

108

*

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

109

* @return extracted input values.

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

110

*/

111

inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)

112

{

113

VEC_DATA_TYPE(DATA_TYPE, 16)

114

temp1 = vload16(0, input_pixel);

115

VEC_DATA_TYPE(DATA_TYPE, 16)

116

temp2 = vload16(0, input_pixel + 12);

117

return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);

118

}

119

120

/** This kernel performs a direct convolution to convolve the low three dimensions.

121

*

122

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

123

* @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

124

* @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1

125

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

126

* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.

127

*

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

128

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

129

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

130

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

131

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

132

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

133

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

134

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

135

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

136

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

137

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

138

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

139

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

140

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

141

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

142

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

143

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Joel Liang

f1f3ebd

2017-11-10 09:59:19 +0800

[diff] [blame]

144

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

145

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

146

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

147

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

148

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

149

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

150

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

151

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

152

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

153

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

154

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

155

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

156

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

157

*/

158

__kernel void direct_convolution1x1(

159

TENSOR3D_DECLARATION(src),

160

TENSOR3D_DECLARATION(dst),

161

TENSOR3D_DECLARATION(weights),

162

#ifdef HAS_BIAS

163

VECTOR_DECLARATION(biases),

164

#endif /* defined(HAS_BIAS) */

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

165

unsigned int weights_stride_w)

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

166

{

167

Image src = CONVERT_TO_IMAGE_STRUCT(src);

168

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

169

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

170

171

#ifdef HAS_BIAS

172

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

173

#endif /* defined(HAS_BIAS) */

174

Michalis Spyrou

def665a

2017-08-14 11:26:37 +0100

[diff] [blame]

175

VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

176

values = 0;

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

177

178

const uint z_index = get_global_id(2);

179

180

weights.ptr += z_index * weights_stride_w;

Gian Marco Iodice

744b5ed

2017-10-06 15:44:27 +0100

[diff] [blame]

181

for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

182

{

183

DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;

184

VEC_DATA_TYPE(DATA_TYPE, 8)

185

input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

186

values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

187

src.ptr += src_stride_z;

188

weights.ptr += weights_stride_z;

189

}

190

191

#ifdef HAS_BIAS

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

192

values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

193

#endif /* defined(HAS_BIAS) */

194

Pablo Tello

2018-06-21 15:13:17 +0100

[diff] [blame]

195

vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);

SiCong Li

2017-07-28 14:46:20 +0100

[diff] [blame]

196

}

steniu01

db00668

2017-08-09 16:26:22 +0100

[diff] [blame]

197

#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

198

199

#if defined(WEIGHTS_DEPTH)

200

201

#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \

202

({ \

203

acc.s0 = mad(src.s0, weight_value, acc.s0); \

204

acc.s1 = mad(src.s1, weight_value, acc.s1); \

205

acc.s2 = mad(src.s2, weight_value, acc.s2); \

206

acc.s3 = mad(src.s3, weight_value, acc.s3); \

207

})

208

209

/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32

210

*

211

* @note This OpenCL kernel works only with stride_x and stride_y equal to 1

212

* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH

213

* @note In case biases, -DHAS_BIAS must to be passed at compile

214

*

215

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

216

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

217

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

218

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

219

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

220

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

221

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

222

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

223

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

224

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

225

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

226

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

227

* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)

228

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

229

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

230

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Joel Liang

f1f3ebd

2017-11-10 09:59:19 +0800

[diff] [blame]

231

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

Gian Marco Iodice

1c8409d

2017-09-06 17:24:25 +0100

[diff] [blame]

232

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

233

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

234

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

235

* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)

236

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

237

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

238

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

239

* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr

240

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

241

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

242

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

243

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension

244

*/

245

__kernel void direct_convolution1x1_f32_bifrost(

246

TENSOR3D_DECLARATION(src),

247

TENSOR3D_DECLARATION(dst),

248

TENSOR3D_DECLARATION(weights),

249

#ifdef HAS_BIAS

250

VECTOR_DECLARATION(biases),

251

#endif /* defined(HAS_BIAS) */

252

unsigned int weights_stride_w)

253

{

254

// Get the kernel index

255

const int kernel_index = get_global_id(2);

256

257

Image src = CONVERT_TO_IMAGE_STRUCT(src);

258

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

float4 acc0 = 0.0f;

float4 acc1 = 0.0f;

float4 acc2 = 0.0f;

float4 acc3 = 0.0f;

__global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);

266

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

267

268

for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)

269

{

270

// Load the weights

271

float weight = *((__global float *)weights_addr);

272

273

// Load values from row0 of input tensor

274

float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));

275

float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));

276

float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));

277

float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));

278

279

CONVOLUTION1x1_BIFROST(acc0, src0, weight);

280

CONVOLUTION1x1_BIFROST(acc1, src1, weight);

281

CONVOLUTION1x1_BIFROST(acc2, src2, weight);

282

CONVOLUTION1x1_BIFROST(acc3, src3, weight);

283

284

src_addr += src_stride_z;

285

weights_addr += weights_stride_z;

}

#ifdef HAS_BIAS

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

290

291

float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));

acc0.s0 += bias;

acc0.s1 += bias;

acc0.s2 += bias;

acc0.s3 += bias;

acc1.s0 += bias;

acc1.s1 += bias;

acc1.s2 += bias;

acc1.s3 += bias;

acc2.s0 += bias;

acc2.s1 += bias;

acc2.s2 += bias;

acc2.s3 += bias;

acc3.s0 += bias;

acc3.s1 += bias;

acc3.s2 += bias;

acc3.s3 += bias;

#endif /* defined(HAS_BIAS) */

310

311

vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));

312

vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));

313

vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));

314

vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));

315

}

Pablo Tello